1 /* 2 Defines matrix-matrix product routines for pairs of SeqAIJ matrices 3 C = A * B 4 */ 5 6 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 7 #include <../src/mat/utils/freespace.h> 8 #include <petscbt.h> 9 #include <petsc/private/isimpl.h> 10 #include <../src/mat/impls/dense/seq/dense.h> 11 12 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 13 { 14 PetscFunctionBegin; 15 if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C)); 16 else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C)); 17 PetscFunctionReturn(PETSC_SUCCESS); 18 } 19 20 /* Modified from MatCreateSeqAIJWithArrays() */ 21 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat) 22 { 23 PetscInt ii; 24 Mat_SeqAIJ *aij; 25 PetscBool isseqaij, ofree_a, ofree_ij; 26 27 PetscFunctionBegin; 28 PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0"); 29 PetscCall(MatSetSizes(mat, m, n, m, n)); 30 31 if (!mtype) { 32 PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij)); 33 if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ)); 34 } else { 35 PetscCall(MatSetType(mat, mtype)); 36 } 37 38 aij = (Mat_SeqAIJ *)mat->data; 39 ofree_a = aij->free_a; 40 ofree_ij = aij->free_ij; 41 /* changes the free flags */ 42 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL)); 43 44 PetscCall(PetscFree(aij->ilen)); 45 PetscCall(PetscFree(aij->imax)); 46 PetscCall(PetscMalloc1(m, &aij->imax)); 47 PetscCall(PetscMalloc1(m, &aij->ilen)); 48 for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) { 49 const PetscInt rnz = i[ii + 1] - i[ii]; 50 aij->nonzerorowcnt += !!rnz; 51 aij->rmax = PetscMax(aij->rmax, rnz); 52 aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii]; 53 } 54 aij->maxnz = i[m]; 55 aij->nz = i[m]; 56 57 if (ofree_a) PetscCall(PetscShmgetDeallocateArray((void **)&aij->a)); 58 if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->j)); 59 if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->i)); 60 61 aij->i = i; 62 aij->j = j; 63 aij->a = a; 64 aij->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */ 65 aij->free_a = PETSC_FALSE; 66 aij->free_ij = PETSC_FALSE; 67 PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6)); 68 // Always build the diag info when i, j are set 69 PetscFunctionReturn(PETSC_SUCCESS); 70 } 71 72 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 73 { 74 Mat_Product *product = C->product; 75 MatProductAlgorithm alg; 76 PetscBool flg; 77 78 PetscFunctionBegin; 79 if (product) { 80 alg = product->alg; 81 } else { 82 alg = "sorted"; 83 } 84 /* sorted */ 85 PetscCall(PetscStrcmp(alg, "sorted", &flg)); 86 if (flg) { 87 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C)); 88 PetscFunctionReturn(PETSC_SUCCESS); 89 } 90 91 /* scalable */ 92 PetscCall(PetscStrcmp(alg, "scalable", &flg)); 93 if (flg) { 94 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C)); 95 PetscFunctionReturn(PETSC_SUCCESS); 96 } 97 98 /* scalable_fast */ 99 PetscCall(PetscStrcmp(alg, "scalable_fast", &flg)); 100 if (flg) { 101 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C)); 102 PetscFunctionReturn(PETSC_SUCCESS); 103 } 104 105 /* heap */ 106 PetscCall(PetscStrcmp(alg, "heap", &flg)); 107 if (flg) { 108 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C)); 109 PetscFunctionReturn(PETSC_SUCCESS); 110 } 111 112 /* btheap */ 113 PetscCall(PetscStrcmp(alg, "btheap", &flg)); 114 if (flg) { 115 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C)); 116 PetscFunctionReturn(PETSC_SUCCESS); 117 } 118 119 /* llcondensed */ 120 PetscCall(PetscStrcmp(alg, "llcondensed", &flg)); 121 if (flg) { 122 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C)); 123 PetscFunctionReturn(PETSC_SUCCESS); 124 } 125 126 /* rowmerge */ 127 PetscCall(PetscStrcmp(alg, "rowmerge", &flg)); 128 if (flg) { 129 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C)); 130 PetscFunctionReturn(PETSC_SUCCESS); 131 } 132 133 #if defined(PETSC_HAVE_HYPRE) 134 PetscCall(PetscStrcmp(alg, "hypre", &flg)); 135 if (flg) { 136 PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C)); 137 PetscFunctionReturn(PETSC_SUCCESS); 138 } 139 #endif 140 141 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 142 } 143 144 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C) 145 { 146 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 147 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 148 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 149 PetscReal afill; 150 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 151 PetscHMapI ta; 152 PetscBT lnkbt; 153 PetscFreeSpaceList free_space = NULL, current_space = NULL; 154 155 PetscFunctionBegin; 156 /* Get ci and cj */ 157 /* Allocate ci array, arrays for fill computation and */ 158 /* free space for accumulating nonzero column info */ 159 PetscCall(PetscMalloc1(am + 2, &ci)); 160 ci[0] = 0; 161 162 /* create and initialize a linked list */ 163 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 164 MatRowMergeMax_SeqAIJ(b, bm, ta); 165 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 166 PetscCall(PetscHMapIDestroy(&ta)); 167 168 PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt)); 169 170 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 171 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 172 173 current_space = free_space; 174 175 /* Determine ci and cj */ 176 for (i = 0; i < am; i++) { 177 anzi = ai[i + 1] - ai[i]; 178 aj = a->j + ai[i]; 179 for (j = 0; j < anzi; j++) { 180 brow = aj[j]; 181 bnzj = bi[brow + 1] - bi[brow]; 182 bj = b->j + bi[brow]; 183 /* add non-zero cols of B into the sorted linked list lnk */ 184 PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt)); 185 } 186 /* add possible missing diagonal entry */ 187 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt)); 188 cnzi = lnk[0]; 189 190 /* If free space is not available, make more free space */ 191 /* Double the amount of total space in the list */ 192 if (current_space->local_remaining < cnzi) { 193 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 194 ndouble++; 195 } 196 197 /* Copy data into free space, then initialize lnk */ 198 PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt)); 199 200 current_space->array += cnzi; 201 current_space->local_used += cnzi; 202 current_space->local_remaining -= cnzi; 203 204 ci[i + 1] = ci[i] + cnzi; 205 } 206 207 /* Column indices are in the list of free space */ 208 /* Allocate space for cj, initialize cj, and */ 209 /* destroy list of free space and other temporary array(s) */ 210 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 211 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 212 PetscCall(PetscLLCondensedDestroy(lnk, lnkbt)); 213 214 /* put together the new symbolic matrix */ 215 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 216 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 217 218 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 219 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 220 c = (Mat_SeqAIJ *)C->data; 221 c->free_a = PETSC_FALSE; 222 c->free_ij = PETSC_TRUE; 223 c->nonew = 0; 224 225 /* fast, needs non-scalable O(bn) array 'abdense' */ 226 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 227 228 /* set MatInfo */ 229 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 230 if (afill < 1.0) afill = 1.0; 231 C->info.mallocs = ndouble; 232 C->info.fill_ratio_given = fill; 233 C->info.fill_ratio_needed = afill; 234 235 #if defined(PETSC_USE_INFO) 236 if (ci[am]) { 237 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 238 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 239 } else { 240 PetscCall(PetscInfo(C, "Empty matrix product\n")); 241 } 242 #endif 243 PetscFunctionReturn(PETSC_SUCCESS); 244 } 245 246 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C) 247 { 248 PetscLogDouble flops = 0.0; 249 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 250 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 251 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 252 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 253 PetscInt am = A->rmap->n, cm = C->rmap->n; 254 PetscInt i, j, k, anzi, bnzi, cnzi, brow; 255 PetscScalar *ca, valtmp; 256 PetscScalar *ab_dense; 257 PetscContainer cab_dense; 258 const PetscScalar *aa, *ba, *baj; 259 260 PetscFunctionBegin; 261 PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 262 PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 263 if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 264 PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 265 c->a = ca; 266 c->free_a = PETSC_TRUE; 267 } else ca = c->a; 268 269 /* TODO this should be done in the symbolic phase */ 270 /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers 271 that is hard to eradicate) */ 272 PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense)); 273 if (!cab_dense) { 274 PetscCall(PetscMalloc1(B->cmap->N, &ab_dense)); 275 PetscCall(PetscObjectContainerCompose((PetscObject)C, "__PETSc__ab_dense", ab_dense, PetscCtxDestroyDefault)); 276 } else PetscCall(PetscContainerGetPointer(cab_dense, &ab_dense)); 277 PetscCall(PetscArrayzero(ab_dense, B->cmap->N)); 278 279 /* clean old values in C */ 280 PetscCall(PetscArrayzero(ca, ci[cm])); 281 /* Traverse A row-wise. */ 282 /* Build the ith row in C by summing over nonzero columns in A, */ 283 /* the rows of B corresponding to nonzeros of A. */ 284 for (i = 0; i < am; i++) { 285 anzi = ai[i + 1] - ai[i]; 286 for (j = 0; j < anzi; j++) { 287 brow = aj[j]; 288 bnzi = bi[brow + 1] - bi[brow]; 289 bjj = PetscSafePointerPlusOffset(bj, bi[brow]); 290 baj = PetscSafePointerPlusOffset(ba, bi[brow]); 291 /* perform dense axpy */ 292 valtmp = aa[j]; 293 for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k]; 294 flops += 2 * bnzi; 295 } 296 aj = PetscSafePointerPlusOffset(aj, anzi); 297 aa = PetscSafePointerPlusOffset(aa, anzi); 298 299 cnzi = ci[i + 1] - ci[i]; 300 for (k = 0; k < cnzi; k++) { 301 ca[k] += ab_dense[cj[k]]; 302 ab_dense[cj[k]] = 0.0; /* zero ab_dense */ 303 } 304 flops += cnzi; 305 cj = PetscSafePointerPlusOffset(cj, cnzi); 306 ca += cnzi; 307 } 308 #if defined(PETSC_HAVE_DEVICE) 309 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 310 #endif 311 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 312 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 313 PetscCall(PetscLogFlops(flops)); 314 PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 315 PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 316 PetscFunctionReturn(PETSC_SUCCESS); 317 } 318 319 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C) 320 { 321 PetscLogDouble flops = 0.0; 322 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 323 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 324 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 325 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 326 PetscInt am = A->rmap->N, cm = C->rmap->N; 327 PetscInt i, j, k, anzi, bnzi, cnzi, brow; 328 PetscScalar *ca = c->a, valtmp; 329 const PetscScalar *aa, *ba, *baj; 330 PetscInt nextb; 331 332 PetscFunctionBegin; 333 PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 334 PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 335 if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 336 PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 337 c->a = ca; 338 c->free_a = PETSC_TRUE; 339 } 340 341 /* clean old values in C */ 342 PetscCall(PetscArrayzero(ca, ci[cm])); 343 /* Traverse A row-wise. */ 344 /* Build the ith row in C by summing over nonzero columns in A, */ 345 /* the rows of B corresponding to nonzeros of A. */ 346 for (i = 0; i < am; i++) { 347 anzi = ai[i + 1] - ai[i]; 348 cnzi = ci[i + 1] - ci[i]; 349 for (j = 0; j < anzi; j++) { 350 brow = aj[j]; 351 bnzi = bi[brow + 1] - bi[brow]; 352 bjj = bj + bi[brow]; 353 baj = ba + bi[brow]; 354 /* perform sparse axpy */ 355 valtmp = aa[j]; 356 nextb = 0; 357 for (k = 0; nextb < bnzi; k++) { 358 if (cj[k] == bjj[nextb]) { /* ccol == bcol */ 359 ca[k] += valtmp * baj[nextb++]; 360 } 361 } 362 flops += 2 * bnzi; 363 } 364 aj += anzi; 365 aa += anzi; 366 cj += cnzi; 367 ca += cnzi; 368 } 369 #if defined(PETSC_HAVE_DEVICE) 370 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 371 #endif 372 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 373 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 374 PetscCall(PetscLogFlops(flops)); 375 PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 376 PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 377 PetscFunctionReturn(PETSC_SUCCESS); 378 } 379 380 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C) 381 { 382 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 383 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 384 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 385 MatScalar *ca; 386 PetscReal afill; 387 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 388 PetscHMapI ta; 389 PetscFreeSpaceList free_space = NULL, current_space = NULL; 390 391 PetscFunctionBegin; 392 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */ 393 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 394 PetscCall(PetscMalloc1(am + 2, &ci)); 395 ci[0] = 0; 396 397 /* create and initialize a linked list */ 398 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 399 MatRowMergeMax_SeqAIJ(b, bm, ta); 400 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 401 PetscCall(PetscHMapIDestroy(&ta)); 402 403 PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk)); 404 405 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 406 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 407 current_space = free_space; 408 409 /* Determine ci and cj */ 410 for (i = 0; i < am; i++) { 411 anzi = ai[i + 1] - ai[i]; 412 aj = a->j + ai[i]; 413 for (j = 0; j < anzi; j++) { 414 brow = aj[j]; 415 bnzj = bi[brow + 1] - bi[brow]; 416 bj = b->j + bi[brow]; 417 /* add non-zero cols of B into the sorted linked list lnk */ 418 PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk)); 419 } 420 /* add possible missing diagonal entry */ 421 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk)); 422 cnzi = lnk[1]; 423 424 /* If free space is not available, make more free space */ 425 /* Double the amount of total space in the list */ 426 if (current_space->local_remaining < cnzi) { 427 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 428 ndouble++; 429 } 430 431 /* Copy data into free space, then initialize lnk */ 432 PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk)); 433 434 current_space->array += cnzi; 435 current_space->local_used += cnzi; 436 current_space->local_remaining -= cnzi; 437 438 ci[i + 1] = ci[i] + cnzi; 439 } 440 441 /* Column indices are in the list of free space */ 442 /* Allocate space for cj, initialize cj, and */ 443 /* destroy list of free space and other temporary array(s) */ 444 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 445 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 446 PetscCall(PetscLLCondensedDestroy_fast(lnk)); 447 448 /* Allocate space for ca */ 449 PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 450 451 /* put together the new symbolic matrix */ 452 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 453 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 454 455 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 456 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 457 c = (Mat_SeqAIJ *)C->data; 458 c->free_a = PETSC_TRUE; 459 c->free_ij = PETSC_TRUE; 460 c->nonew = 0; 461 462 /* slower, less memory */ 463 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 464 465 /* set MatInfo */ 466 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 467 if (afill < 1.0) afill = 1.0; 468 C->info.mallocs = ndouble; 469 C->info.fill_ratio_given = fill; 470 C->info.fill_ratio_needed = afill; 471 472 #if defined(PETSC_USE_INFO) 473 if (ci[am]) { 474 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 475 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 476 } else { 477 PetscCall(PetscInfo(C, "Empty matrix product\n")); 478 } 479 #endif 480 PetscFunctionReturn(PETSC_SUCCESS); 481 } 482 483 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C) 484 { 485 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 486 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 487 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 488 MatScalar *ca; 489 PetscReal afill; 490 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 491 PetscHMapI ta; 492 PetscFreeSpaceList free_space = NULL, current_space = NULL; 493 494 PetscFunctionBegin; 495 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */ 496 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 497 PetscCall(PetscMalloc1(am + 2, &ci)); 498 ci[0] = 0; 499 500 /* create and initialize a linked list */ 501 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 502 MatRowMergeMax_SeqAIJ(b, bm, ta); 503 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 504 PetscCall(PetscHMapIDestroy(&ta)); 505 PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk)); 506 507 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 508 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 509 current_space = free_space; 510 511 /* Determine ci and cj */ 512 for (i = 0; i < am; i++) { 513 anzi = ai[i + 1] - ai[i]; 514 aj = a->j + ai[i]; 515 for (j = 0; j < anzi; j++) { 516 brow = aj[j]; 517 bnzj = bi[brow + 1] - bi[brow]; 518 bj = b->j + bi[brow]; 519 /* add non-zero cols of B into the sorted linked list lnk */ 520 PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk)); 521 } 522 /* add possible missing diagonal entry */ 523 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk)); 524 525 cnzi = lnk[0]; 526 527 /* If free space is not available, make more free space */ 528 /* Double the amount of total space in the list */ 529 if (current_space->local_remaining < cnzi) { 530 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 531 ndouble++; 532 } 533 534 /* Copy data into free space, then initialize lnk */ 535 PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk)); 536 537 current_space->array += cnzi; 538 current_space->local_used += cnzi; 539 current_space->local_remaining -= cnzi; 540 541 ci[i + 1] = ci[i] + cnzi; 542 } 543 544 /* Column indices are in the list of free space */ 545 /* Allocate space for cj, initialize cj, and */ 546 /* destroy list of free space and other temporary array(s) */ 547 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 548 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 549 PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 550 551 /* Allocate space for ca */ 552 PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 553 554 /* put together the new symbolic matrix */ 555 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 556 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 557 558 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 559 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 560 c = (Mat_SeqAIJ *)C->data; 561 c->free_a = PETSC_TRUE; 562 c->free_ij = PETSC_TRUE; 563 c->nonew = 0; 564 565 /* slower, less memory */ 566 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 567 568 /* set MatInfo */ 569 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 570 if (afill < 1.0) afill = 1.0; 571 C->info.mallocs = ndouble; 572 C->info.fill_ratio_given = fill; 573 C->info.fill_ratio_needed = afill; 574 575 #if defined(PETSC_USE_INFO) 576 if (ci[am]) { 577 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 578 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 579 } else { 580 PetscCall(PetscInfo(C, "Empty matrix product\n")); 581 } 582 #endif 583 PetscFunctionReturn(PETSC_SUCCESS); 584 } 585 586 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C) 587 { 588 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 589 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 590 PetscInt *ci, *cj, *bb; 591 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 592 PetscReal afill; 593 PetscInt i, j, col, ndouble = 0; 594 PetscFreeSpaceList free_space = NULL, current_space = NULL; 595 PetscHeap h; 596 597 PetscFunctionBegin; 598 /* Get ci and cj - by merging sorted rows using a heap */ 599 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 600 PetscCall(PetscMalloc1(am + 2, &ci)); 601 ci[0] = 0; 602 603 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 604 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 605 current_space = free_space; 606 607 PetscCall(PetscHeapCreate(a->rmax, &h)); 608 PetscCall(PetscMalloc1(a->rmax, &bb)); 609 610 /* Determine ci and cj */ 611 for (i = 0; i < am; i++) { 612 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 613 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 614 ci[i + 1] = ci[i]; 615 /* Populate the min heap */ 616 for (j = 0; j < anzi; j++) { 617 bb[j] = bi[acol[j]]; /* bb points at the start of the row */ 618 if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */ 619 PetscCall(PetscHeapAdd(h, j, bj[bb[j]++])); 620 } 621 } 622 /* Pick off the min element, adding it to free space */ 623 PetscCall(PetscHeapPop(h, &j, &col)); 624 while (j >= 0) { 625 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 626 PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 627 ndouble++; 628 } 629 *(current_space->array++) = col; 630 current_space->local_used++; 631 current_space->local_remaining--; 632 ci[i + 1]++; 633 634 /* stash if anything else remains in this row of B */ 635 if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++])); 636 while (1) { /* pop and stash any other rows of B that also had an entry in this column */ 637 PetscInt j2, col2; 638 PetscCall(PetscHeapPeek(h, &j2, &col2)); 639 if (col2 != col) break; 640 PetscCall(PetscHeapPop(h, &j2, &col2)); 641 if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++])); 642 } 643 /* Put any stashed elements back into the min heap */ 644 PetscCall(PetscHeapUnstash(h)); 645 PetscCall(PetscHeapPop(h, &j, &col)); 646 } 647 } 648 PetscCall(PetscFree(bb)); 649 PetscCall(PetscHeapDestroy(&h)); 650 651 /* Column indices are in the list of free space */ 652 /* Allocate space for cj, initialize cj, and */ 653 /* destroy list of free space and other temporary array(s) */ 654 PetscCall(PetscMalloc1(ci[am], &cj)); 655 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 656 657 /* put together the new symbolic matrix */ 658 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 659 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 660 661 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 662 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 663 c = (Mat_SeqAIJ *)C->data; 664 c->free_a = PETSC_TRUE; 665 c->free_ij = PETSC_TRUE; 666 c->nonew = 0; 667 668 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 669 670 /* set MatInfo */ 671 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 672 if (afill < 1.0) afill = 1.0; 673 C->info.mallocs = ndouble; 674 C->info.fill_ratio_given = fill; 675 C->info.fill_ratio_needed = afill; 676 677 #if defined(PETSC_USE_INFO) 678 if (ci[am]) { 679 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 680 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 681 } else { 682 PetscCall(PetscInfo(C, "Empty matrix product\n")); 683 } 684 #endif 685 PetscFunctionReturn(PETSC_SUCCESS); 686 } 687 688 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C) 689 { 690 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 691 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 692 PetscInt *ci, *cj, *bb; 693 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 694 PetscReal afill; 695 PetscInt i, j, col, ndouble = 0; 696 PetscFreeSpaceList free_space = NULL, current_space = NULL; 697 PetscHeap h; 698 PetscBT bt; 699 700 PetscFunctionBegin; 701 /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */ 702 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 703 PetscCall(PetscMalloc1(am + 2, &ci)); 704 ci[0] = 0; 705 706 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 707 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 708 709 current_space = free_space; 710 711 PetscCall(PetscHeapCreate(a->rmax, &h)); 712 PetscCall(PetscMalloc1(a->rmax, &bb)); 713 PetscCall(PetscBTCreate(bn, &bt)); 714 715 /* Determine ci and cj */ 716 for (i = 0; i < am; i++) { 717 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 718 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 719 const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */ 720 ci[i + 1] = ci[i]; 721 /* Populate the min heap */ 722 for (j = 0; j < anzi; j++) { 723 PetscInt brow = acol[j]; 724 for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) { 725 PetscInt bcol = bj[bb[j]]; 726 if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 727 PetscCall(PetscHeapAdd(h, j, bcol)); 728 bb[j]++; 729 break; 730 } 731 } 732 } 733 /* Pick off the min element, adding it to free space */ 734 PetscCall(PetscHeapPop(h, &j, &col)); 735 while (j >= 0) { 736 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 737 fptr = NULL; /* need PetscBTMemzero */ 738 PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 739 ndouble++; 740 } 741 *(current_space->array++) = col; 742 current_space->local_used++; 743 current_space->local_remaining--; 744 ci[i + 1]++; 745 746 /* stash if anything else remains in this row of B */ 747 for (; bb[j] < bi[acol[j] + 1]; bb[j]++) { 748 PetscInt bcol = bj[bb[j]]; 749 if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 750 PetscCall(PetscHeapAdd(h, j, bcol)); 751 bb[j]++; 752 break; 753 } 754 } 755 PetscCall(PetscHeapPop(h, &j, &col)); 756 } 757 if (fptr) { /* Clear the bits for this row */ 758 for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr)); 759 } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */ 760 PetscCall(PetscBTMemzero(bn, bt)); 761 } 762 } 763 PetscCall(PetscFree(bb)); 764 PetscCall(PetscHeapDestroy(&h)); 765 PetscCall(PetscBTDestroy(&bt)); 766 767 /* Column indices are in the list of free space */ 768 /* Allocate space for cj, initialize cj, and */ 769 /* destroy list of free space and other temporary array(s) */ 770 PetscCall(PetscMalloc1(ci[am], &cj)); 771 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 772 773 /* put together the new symbolic matrix */ 774 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 775 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 776 777 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 778 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 779 c = (Mat_SeqAIJ *)C->data; 780 c->free_a = PETSC_TRUE; 781 c->free_ij = PETSC_TRUE; 782 c->nonew = 0; 783 784 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 785 786 /* set MatInfo */ 787 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 788 if (afill < 1.0) afill = 1.0; 789 C->info.mallocs = ndouble; 790 C->info.fill_ratio_given = fill; 791 C->info.fill_ratio_needed = afill; 792 793 #if defined(PETSC_USE_INFO) 794 if (ci[am]) { 795 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 796 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 797 } else { 798 PetscCall(PetscInfo(C, "Empty matrix product\n")); 799 } 800 #endif 801 PetscFunctionReturn(PETSC_SUCCESS); 802 } 803 804 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C) 805 { 806 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 807 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1; 808 PetscInt *ci, *cj, *outputj, worki_L1[9], worki_L2[9]; 809 PetscInt c_maxmem, a_maxrownnz = 0, a_rownnz; 810 const PetscInt workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7}; 811 const PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 812 const PetscInt *brow_ptr[8], *brow_end[8]; 813 PetscInt window[8]; 814 PetscInt window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows; 815 PetscInt i, k, ndouble = 0, L1_rowsleft, rowsleft; 816 PetscReal afill; 817 PetscInt *workj_L1, *workj_L2, *workj_L3; 818 PetscInt L1_nnz, L2_nnz; 819 820 /* Step 1: Get upper bound on memory required for allocation. 821 Because of the way virtual memory works, 822 only the memory pages that are actually needed will be physically allocated. */ 823 PetscFunctionBegin; 824 PetscCall(PetscMalloc1(am + 1, &ci)); 825 for (i = 0; i < am; i++) { 826 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 827 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 828 a_rownnz = 0; 829 for (k = 0; k < anzi; ++k) { 830 a_rownnz += bi[acol[k] + 1] - bi[acol[k]]; 831 if (a_rownnz > bn) { 832 a_rownnz = bn; 833 break; 834 } 835 } 836 a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz); 837 } 838 /* temporary work areas for merging rows */ 839 PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1)); 840 PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2)); 841 PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3)); 842 843 /* This should be enough for almost all matrices. If not, memory is reallocated later. */ 844 c_maxmem = 8 * (ai[am] + bi[bm]); 845 /* Step 2: Populate pattern for C */ 846 PetscCall(PetscMalloc1(c_maxmem, &cj)); 847 848 ci_nnz = 0; 849 ci[0] = 0; 850 worki_L1[0] = 0; 851 worki_L2[0] = 0; 852 for (i = 0; i < am; i++) { 853 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 854 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 855 rowsleft = anzi; 856 inputcol_L1 = acol; 857 L2_nnz = 0; 858 L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */ 859 worki_L2[1] = 0; 860 outputi_nnz = 0; 861 862 /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */ 863 while (ci_nnz + a_maxrownnz > c_maxmem) { 864 c_maxmem *= 2; 865 ndouble++; 866 PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj)); 867 } 868 869 while (rowsleft) { 870 L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */ 871 L1_nrows = 0; 872 L1_nnz = 0; 873 inputcol = inputcol_L1; 874 inputi = bi; 875 inputj = bj; 876 877 /* The following macro is used to specialize for small rows in A. 878 This helps with compiler unrolling, improving performance substantially. 879 Input: inputj inputi inputcol bn 880 Output: outputj outputi_nnz */ 881 #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \ 882 do { \ 883 window_min = bn; \ 884 outputi_nnz = 0; \ 885 for (k = 0; k < ANNZ; ++k) { \ 886 brow_ptr[k] = inputj + inputi[inputcol[k]]; \ 887 brow_end[k] = inputj + inputi[inputcol[k] + 1]; \ 888 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 889 window_min = PetscMin(window[k], window_min); \ 890 } \ 891 while (window_min < bn) { \ 892 outputj[outputi_nnz++] = window_min; \ 893 /* advance front and compute new minimum */ \ 894 old_window_min = window_min; \ 895 window_min = bn; \ 896 for (k = 0; k < ANNZ; ++k) { \ 897 if (window[k] == old_window_min) { \ 898 brow_ptr[k]++; \ 899 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 900 } \ 901 window_min = PetscMin(window[k], window_min); \ 902 } \ 903 } \ 904 } while (0) 905 906 /************** L E V E L 1 ***************/ 907 /* Merge up to 8 rows of B to L1 work array*/ 908 while (L1_rowsleft) { 909 outputi_nnz = 0; 910 if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/ 911 else outputj = cj + ci_nnz; /* Merge directly to C */ 912 913 switch (L1_rowsleft) { 914 case 1: 915 brow_ptr[0] = inputj + inputi[inputcol[0]]; 916 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 917 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 918 inputcol += L1_rowsleft; 919 rowsleft -= L1_rowsleft; 920 L1_rowsleft = 0; 921 break; 922 case 2: 923 MatMatMultSymbolic_RowMergeMacro(2); 924 inputcol += L1_rowsleft; 925 rowsleft -= L1_rowsleft; 926 L1_rowsleft = 0; 927 break; 928 case 3: 929 MatMatMultSymbolic_RowMergeMacro(3); 930 inputcol += L1_rowsleft; 931 rowsleft -= L1_rowsleft; 932 L1_rowsleft = 0; 933 break; 934 case 4: 935 MatMatMultSymbolic_RowMergeMacro(4); 936 inputcol += L1_rowsleft; 937 rowsleft -= L1_rowsleft; 938 L1_rowsleft = 0; 939 break; 940 case 5: 941 MatMatMultSymbolic_RowMergeMacro(5); 942 inputcol += L1_rowsleft; 943 rowsleft -= L1_rowsleft; 944 L1_rowsleft = 0; 945 break; 946 case 6: 947 MatMatMultSymbolic_RowMergeMacro(6); 948 inputcol += L1_rowsleft; 949 rowsleft -= L1_rowsleft; 950 L1_rowsleft = 0; 951 break; 952 case 7: 953 MatMatMultSymbolic_RowMergeMacro(7); 954 inputcol += L1_rowsleft; 955 rowsleft -= L1_rowsleft; 956 L1_rowsleft = 0; 957 break; 958 default: 959 MatMatMultSymbolic_RowMergeMacro(8); 960 inputcol += 8; 961 rowsleft -= 8; 962 L1_rowsleft -= 8; 963 break; 964 } 965 inputcol_L1 = inputcol; 966 L1_nnz += outputi_nnz; 967 worki_L1[++L1_nrows] = L1_nnz; 968 } 969 970 /********************** L E V E L 2 ************************/ 971 /* Merge from L1 work array to either C or to L2 work array */ 972 if (anzi > 8) { 973 inputi = worki_L1; 974 inputj = workj_L1; 975 inputcol = workcol; 976 outputi_nnz = 0; 977 978 if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */ 979 else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */ 980 981 switch (L1_nrows) { 982 case 1: 983 brow_ptr[0] = inputj + inputi[inputcol[0]]; 984 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 985 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 986 break; 987 case 2: 988 MatMatMultSymbolic_RowMergeMacro(2); 989 break; 990 case 3: 991 MatMatMultSymbolic_RowMergeMacro(3); 992 break; 993 case 4: 994 MatMatMultSymbolic_RowMergeMacro(4); 995 break; 996 case 5: 997 MatMatMultSymbolic_RowMergeMacro(5); 998 break; 999 case 6: 1000 MatMatMultSymbolic_RowMergeMacro(6); 1001 break; 1002 case 7: 1003 MatMatMultSymbolic_RowMergeMacro(7); 1004 break; 1005 case 8: 1006 MatMatMultSymbolic_RowMergeMacro(8); 1007 break; 1008 default: 1009 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!"); 1010 } 1011 L2_nnz += outputi_nnz; 1012 worki_L2[++L2_nrows] = L2_nnz; 1013 1014 /************************ L E V E L 3 **********************/ 1015 /* Merge from L2 work array to either C or to L2 work array */ 1016 if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) { 1017 inputi = worki_L2; 1018 inputj = workj_L2; 1019 inputcol = workcol; 1020 outputi_nnz = 0; 1021 if (rowsleft) outputj = workj_L3; 1022 else outputj = cj + ci_nnz; 1023 switch (L2_nrows) { 1024 case 1: 1025 brow_ptr[0] = inputj + inputi[inputcol[0]]; 1026 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 1027 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1028 break; 1029 case 2: 1030 MatMatMultSymbolic_RowMergeMacro(2); 1031 break; 1032 case 3: 1033 MatMatMultSymbolic_RowMergeMacro(3); 1034 break; 1035 case 4: 1036 MatMatMultSymbolic_RowMergeMacro(4); 1037 break; 1038 case 5: 1039 MatMatMultSymbolic_RowMergeMacro(5); 1040 break; 1041 case 6: 1042 MatMatMultSymbolic_RowMergeMacro(6); 1043 break; 1044 case 7: 1045 MatMatMultSymbolic_RowMergeMacro(7); 1046 break; 1047 case 8: 1048 MatMatMultSymbolic_RowMergeMacro(8); 1049 break; 1050 default: 1051 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!"); 1052 } 1053 L2_nrows = 1; 1054 L2_nnz = outputi_nnz; 1055 worki_L2[1] = outputi_nnz; 1056 /* Copy to workj_L2 */ 1057 if (rowsleft) { 1058 for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k]; 1059 } 1060 } 1061 } 1062 } /* while (rowsleft) */ 1063 #undef MatMatMultSymbolic_RowMergeMacro 1064 1065 /* terminate current row */ 1066 ci_nnz += outputi_nnz; 1067 ci[i + 1] = ci_nnz; 1068 } 1069 1070 /* Step 3: Create the new symbolic matrix */ 1071 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 1072 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1073 1074 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1075 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1076 c = (Mat_SeqAIJ *)C->data; 1077 c->free_a = PETSC_TRUE; 1078 c->free_ij = PETSC_TRUE; 1079 c->nonew = 0; 1080 1081 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1082 1083 /* set MatInfo */ 1084 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 1085 if (afill < 1.0) afill = 1.0; 1086 C->info.mallocs = ndouble; 1087 C->info.fill_ratio_given = fill; 1088 C->info.fill_ratio_needed = afill; 1089 1090 #if defined(PETSC_USE_INFO) 1091 if (ci[am]) { 1092 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 1093 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1094 } else { 1095 PetscCall(PetscInfo(C, "Empty matrix product\n")); 1096 } 1097 #endif 1098 1099 /* Step 4: Free temporary work areas */ 1100 PetscCall(PetscFree(workj_L1)); 1101 PetscCall(PetscFree(workj_L2)); 1102 PetscCall(PetscFree(workj_L3)); 1103 PetscFunctionReturn(PETSC_SUCCESS); 1104 } 1105 1106 /* concatenate unique entries and then sort */ 1107 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C) 1108 { 1109 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 1110 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 1111 PetscInt *ci, *cj, bcol; 1112 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 1113 PetscReal afill; 1114 PetscInt i, j, ndouble = 0; 1115 PetscSegBuffer seg, segrow; 1116 char *seen; 1117 1118 PetscFunctionBegin; 1119 PetscCall(PetscMalloc1(am + 1, &ci)); 1120 ci[0] = 0; 1121 1122 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 1123 PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg)); 1124 PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow)); 1125 PetscCall(PetscCalloc1(bn, &seen)); 1126 1127 /* Determine ci and cj */ 1128 for (i = 0; i < am; i++) { 1129 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1130 const PetscInt *acol = PetscSafePointerPlusOffset(aj, ai[i]); /* column indices of nonzero entries in this row */ 1131 PetscInt packlen = 0, *PETSC_RESTRICT crow; 1132 1133 /* Pack segrow */ 1134 for (j = 0; j < anzi; j++) { 1135 PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k; 1136 for (k = bjstart; k < bjend; k++) { 1137 bcol = bj[k]; 1138 if (!seen[bcol]) { /* new entry */ 1139 PetscInt *PETSC_RESTRICT slot; 1140 PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 1141 *slot = bcol; 1142 seen[bcol] = 1; 1143 packlen++; 1144 } 1145 } 1146 } 1147 1148 /* Check i-th diagonal entry */ 1149 if (C->force_diagonals && !seen[i]) { 1150 PetscInt *PETSC_RESTRICT slot; 1151 PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 1152 *slot = i; 1153 seen[i] = 1; 1154 packlen++; 1155 } 1156 1157 PetscCall(PetscSegBufferGetInts(seg, packlen, &crow)); 1158 PetscCall(PetscSegBufferExtractTo(segrow, crow)); 1159 PetscCall(PetscSortInt(packlen, crow)); 1160 ci[i + 1] = ci[i] + packlen; 1161 for (j = 0; j < packlen; j++) seen[crow[j]] = 0; 1162 } 1163 PetscCall(PetscSegBufferDestroy(&segrow)); 1164 PetscCall(PetscFree(seen)); 1165 1166 /* Column indices are in the segmented buffer */ 1167 PetscCall(PetscSegBufferExtractAlloc(seg, &cj)); 1168 PetscCall(PetscSegBufferDestroy(&seg)); 1169 1170 /* put together the new symbolic matrix */ 1171 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 1172 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1173 1174 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1175 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1176 c = (Mat_SeqAIJ *)C->data; 1177 c->free_a = PETSC_TRUE; 1178 c->free_ij = PETSC_TRUE; 1179 c->nonew = 0; 1180 1181 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1182 1183 /* set MatInfo */ 1184 afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5; 1185 if (afill < 1.0) afill = 1.0; 1186 C->info.mallocs = ndouble; 1187 C->info.fill_ratio_given = fill; 1188 C->info.fill_ratio_needed = afill; 1189 1190 #if defined(PETSC_USE_INFO) 1191 if (ci[am]) { 1192 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 1193 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1194 } else { 1195 PetscCall(PetscInfo(C, "Empty matrix product\n")); 1196 } 1197 #endif 1198 PetscFunctionReturn(PETSC_SUCCESS); 1199 } 1200 1201 static PetscErrorCode MatProductCtxDestroy_SeqAIJ_MatMatMultTrans(PetscCtxRt data) 1202 { 1203 MatProductCtx_MatMatTransMult *abt = *(MatProductCtx_MatMatTransMult **)data; 1204 1205 PetscFunctionBegin; 1206 PetscCall(MatTransposeColoringDestroy(&abt->matcoloring)); 1207 PetscCall(MatDestroy(&abt->Bt_den)); 1208 PetscCall(MatDestroy(&abt->ABt_den)); 1209 PetscCall(PetscFree(abt)); 1210 PetscFunctionReturn(PETSC_SUCCESS); 1211 } 1212 1213 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 1214 { 1215 Mat Bt; 1216 MatProductCtx_MatMatTransMult *abt; 1217 Mat_Product *product = C->product; 1218 char *alg; 1219 1220 PetscFunctionBegin; 1221 PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1222 PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 1223 1224 /* create symbolic Bt */ 1225 PetscCall(MatTransposeSymbolic(B, &Bt)); 1226 PetscCall(MatSetBlockSizes(Bt, A->cmap->bs, B->cmap->bs)); 1227 PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name)); 1228 1229 /* get symbolic C=A*Bt */ 1230 PetscCall(PetscStrallocpy(product->alg, &alg)); 1231 PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */ 1232 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C)); 1233 PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */ 1234 PetscCall(PetscFree(alg)); 1235 1236 /* create a supporting struct for reuse intermediate dense matrices with matcoloring */ 1237 PetscCall(PetscNew(&abt)); 1238 1239 product->data = abt; 1240 product->destroy = MatProductCtxDestroy_SeqAIJ_MatMatMultTrans; 1241 1242 C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ; 1243 1244 abt->usecoloring = PETSC_FALSE; 1245 PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring)); 1246 if (abt->usecoloring) { 1247 /* Create MatTransposeColoring from symbolic C=A*B^T */ 1248 MatTransposeColoring matcoloring; 1249 MatColoring coloring; 1250 ISColoring iscoloring; 1251 Mat Bt_dense, C_dense; 1252 1253 /* inode causes memory problem */ 1254 PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE)); 1255 1256 PetscCall(MatColoringCreate(C, &coloring)); 1257 PetscCall(MatColoringSetDistance(coloring, 2)); 1258 PetscCall(MatColoringSetType(coloring, MATCOLORINGSL)); 1259 PetscCall(MatColoringSetFromOptions(coloring)); 1260 PetscCall(MatColoringApply(coloring, &iscoloring)); 1261 PetscCall(MatColoringDestroy(&coloring)); 1262 PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring)); 1263 1264 abt->matcoloring = matcoloring; 1265 1266 PetscCall(ISColoringDestroy(&iscoloring)); 1267 1268 /* Create Bt_dense and C_dense = A*Bt_dense */ 1269 PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense)); 1270 PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors)); 1271 PetscCall(MatSetType(Bt_dense, MATSEQDENSE)); 1272 PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL)); 1273 1274 Bt_dense->assembled = PETSC_TRUE; 1275 abt->Bt_den = Bt_dense; 1276 1277 PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense)); 1278 PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors)); 1279 PetscCall(MatSetType(C_dense, MATSEQDENSE)); 1280 PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL)); 1281 1282 Bt_dense->assembled = PETSC_TRUE; 1283 abt->ABt_den = C_dense; 1284 1285 #if defined(PETSC_USE_INFO) 1286 { 1287 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 1288 PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n, 1289 Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)c->nz) / ((PetscReal)(A->rmap->n * matcoloring->ncolors))))); 1290 } 1291 #endif 1292 } 1293 /* clean up */ 1294 PetscCall(MatDestroy(&Bt)); 1295 PetscFunctionReturn(PETSC_SUCCESS); 1296 } 1297 1298 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 1299 { 1300 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1301 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow; 1302 PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol; 1303 PetscLogDouble flops = 0.0; 1304 MatScalar *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval; 1305 MatProductCtx_MatMatTransMult *abt; 1306 Mat_Product *product = C->product; 1307 1308 PetscFunctionBegin; 1309 PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1310 abt = (MatProductCtx_MatMatTransMult *)product->data; 1311 PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1312 /* clear old values in C */ 1313 if (!c->a) { 1314 PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 1315 c->a = ca; 1316 c->free_a = PETSC_TRUE; 1317 } else { 1318 ca = c->a; 1319 PetscCall(PetscArrayzero(ca, ci[cm] + 1)); 1320 } 1321 1322 if (abt->usecoloring) { 1323 MatTransposeColoring matcoloring = abt->matcoloring; 1324 Mat Bt_dense, C_dense = abt->ABt_den; 1325 1326 /* Get Bt_dense by Apply MatTransposeColoring to B */ 1327 Bt_dense = abt->Bt_den; 1328 PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense)); 1329 1330 /* C_dense = A*Bt_dense */ 1331 PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense)); 1332 1333 /* Recover C from C_dense */ 1334 PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C)); 1335 PetscFunctionReturn(PETSC_SUCCESS); 1336 } 1337 1338 for (i = 0; i < cm; i++) { 1339 anzi = ai[i + 1] - ai[i]; 1340 acol = PetscSafePointerPlusOffset(aj, ai[i]); 1341 aval = PetscSafePointerPlusOffset(aa, ai[i]); 1342 cnzi = ci[i + 1] - ci[i]; 1343 ccol = PetscSafePointerPlusOffset(cj, ci[i]); 1344 cval = ca + ci[i]; 1345 for (j = 0; j < cnzi; j++) { 1346 brow = ccol[j]; 1347 bnzj = bi[brow + 1] - bi[brow]; 1348 bcol = bj + bi[brow]; 1349 bval = ba + bi[brow]; 1350 1351 /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */ 1352 nexta = 0; 1353 nextb = 0; 1354 while (nexta < anzi && nextb < bnzj) { 1355 while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++; 1356 if (nexta == anzi) break; 1357 while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++; 1358 if (nextb == bnzj) break; 1359 if (acol[nexta] == bcol[nextb]) { 1360 cval[j] += aval[nexta] * bval[nextb]; 1361 nexta++; 1362 nextb++; 1363 flops += 2; 1364 } 1365 } 1366 } 1367 } 1368 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1369 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 1370 PetscCall(PetscLogFlops(flops)); 1371 PetscFunctionReturn(PETSC_SUCCESS); 1372 } 1373 1374 PetscErrorCode MatProductCtxDestroy_SeqAIJ_MatTransMatMult(PetscCtxRt data) 1375 { 1376 MatProductCtx_MatTransMatMult *atb = *(MatProductCtx_MatTransMatMult **)data; 1377 1378 PetscFunctionBegin; 1379 PetscCall(MatDestroy(&atb->At)); 1380 if (atb->destroy) PetscCall((*atb->destroy)(&atb->data)); 1381 PetscCall(PetscFree(atb)); 1382 PetscFunctionReturn(PETSC_SUCCESS); 1383 } 1384 1385 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 1386 { 1387 Mat At = NULL; 1388 Mat_Product *product = C->product; 1389 PetscBool flg, def, square; 1390 1391 PetscFunctionBegin; 1392 MatCheckProduct(C, 4); 1393 square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE); 1394 /* outerproduct */ 1395 PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg)); 1396 if (flg) { 1397 /* create symbolic At */ 1398 if (!square) { 1399 PetscCall(MatTransposeSymbolic(A, &At)); 1400 PetscCall(MatSetBlockSizes(At, A->cmap->bs, B->cmap->bs)); 1401 PetscCall(MatSetType(At, ((PetscObject)A)->type_name)); 1402 } 1403 /* get symbolic C=At*B */ 1404 PetscCall(MatProductSetAlgorithm(C, "sorted")); 1405 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 1406 1407 /* clean up */ 1408 if (!square) PetscCall(MatDestroy(&At)); 1409 1410 C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */ 1411 PetscCall(MatProductSetAlgorithm(C, "outerproduct")); 1412 PetscFunctionReturn(PETSC_SUCCESS); 1413 } 1414 1415 /* matmatmult */ 1416 PetscCall(PetscStrcmp(product->alg, "default", &def)); 1417 PetscCall(PetscStrcmp(product->alg, "at*b", &flg)); 1418 if (flg || def) { 1419 MatProductCtx_MatTransMatMult *atb; 1420 1421 PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 1422 PetscCall(PetscNew(&atb)); 1423 if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At)); 1424 PetscCall(MatProductSetAlgorithm(C, "sorted")); 1425 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 1426 PetscCall(MatProductSetAlgorithm(C, "at*b")); 1427 product->data = atb; 1428 product->destroy = MatProductCtxDestroy_SeqAIJ_MatTransMatMult; 1429 atb->At = At; 1430 1431 C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */ 1432 PetscFunctionReturn(PETSC_SUCCESS); 1433 } 1434 1435 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 1436 } 1437 1438 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 1439 { 1440 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1441 PetscInt am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb; 1442 PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k; 1443 PetscLogDouble flops = 0.0; 1444 MatScalar *aa = a->a, *ba, *ca, *caj; 1445 1446 PetscFunctionBegin; 1447 if (!c->a) { 1448 PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 1449 1450 c->a = ca; 1451 c->free_a = PETSC_TRUE; 1452 } else { 1453 ca = c->a; 1454 PetscCall(PetscArrayzero(ca, ci[cm])); 1455 } 1456 1457 /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */ 1458 for (i = 0; i < am; i++) { 1459 bj = b->j + bi[i]; 1460 ba = b->a + bi[i]; 1461 bnzi = bi[i + 1] - bi[i]; 1462 anzi = ai[i + 1] - ai[i]; 1463 for (j = 0; j < anzi; j++) { 1464 nextb = 0; 1465 crow = *aj++; 1466 cjj = cj + ci[crow]; 1467 caj = ca + ci[crow]; 1468 /* perform sparse axpy operation. Note cjj includes bj. */ 1469 for (k = 0; nextb < bnzi; k++) { 1470 if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */ 1471 caj[k] += (*aa) * (*(ba + nextb)); 1472 nextb++; 1473 } 1474 } 1475 flops += 2 * bnzi; 1476 aa++; 1477 } 1478 } 1479 1480 /* Assemble the final matrix and clean up */ 1481 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1482 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 1483 PetscCall(PetscLogFlops(flops)); 1484 PetscFunctionReturn(PETSC_SUCCESS); 1485 } 1486 1487 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1488 { 1489 PetscFunctionBegin; 1490 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 1491 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense; 1492 PetscFunctionReturn(PETSC_SUCCESS); 1493 } 1494 1495 PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add) 1496 { 1497 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1498 PetscScalar *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4; 1499 const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av; 1500 const PetscInt *aj; 1501 PetscInt cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n; 1502 PetscInt clda; 1503 PetscInt am4, bm4, col, i, j, n; 1504 1505 PetscFunctionBegin; 1506 if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 1507 PetscCall(MatSeqAIJGetArrayRead(A, &av)); 1508 if (add) { 1509 PetscCall(MatDenseGetArray(C, &c)); 1510 } else { 1511 PetscCall(MatDenseGetArrayWrite(C, &c)); 1512 } 1513 PetscCall(MatDenseGetArrayRead(B, &b)); 1514 PetscCall(MatDenseGetLDA(B, &bm)); 1515 PetscCall(MatDenseGetLDA(C, &clda)); 1516 am4 = 4 * clda; 1517 bm4 = 4 * bm; 1518 if (b) { 1519 b1 = b; 1520 b2 = b1 + bm; 1521 b3 = b2 + bm; 1522 b4 = b3 + bm; 1523 } else b1 = b2 = b3 = b4 = NULL; 1524 c1 = c; 1525 c2 = c1 + clda; 1526 c3 = c2 + clda; 1527 c4 = c3 + clda; 1528 for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */ 1529 for (i = 0; i < am; i++) { /* over rows of A in those columns */ 1530 r1 = r2 = r3 = r4 = 0.0; 1531 n = a->i[i + 1] - a->i[i]; 1532 aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 1533 aa = PetscSafePointerPlusOffset(av, a->i[i]); 1534 for (j = 0; j < n; j++) { 1535 const PetscScalar aatmp = aa[j]; 1536 const PetscInt ajtmp = aj[j]; 1537 r1 += aatmp * b1[ajtmp]; 1538 r2 += aatmp * b2[ajtmp]; 1539 r3 += aatmp * b3[ajtmp]; 1540 r4 += aatmp * b4[ajtmp]; 1541 } 1542 if (add) { 1543 c1[i] += r1; 1544 c2[i] += r2; 1545 c3[i] += r3; 1546 c4[i] += r4; 1547 } else { 1548 c1[i] = r1; 1549 c2[i] = r2; 1550 c3[i] = r3; 1551 c4[i] = r4; 1552 } 1553 } 1554 if (b) { 1555 b1 += bm4; 1556 b2 += bm4; 1557 b3 += bm4; 1558 b4 += bm4; 1559 } 1560 c1 += am4; 1561 c2 += am4; 1562 c3 += am4; 1563 c4 += am4; 1564 } 1565 /* process remaining columns */ 1566 if (col != cn) { 1567 PetscInt rc = cn - col; 1568 1569 if (rc == 1) { 1570 for (i = 0; i < am; i++) { 1571 r1 = 0.0; 1572 n = a->i[i + 1] - a->i[i]; 1573 aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 1574 aa = PetscSafePointerPlusOffset(av, a->i[i]); 1575 for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]]; 1576 if (add) c1[i] += r1; 1577 else c1[i] = r1; 1578 } 1579 } else if (rc == 2) { 1580 for (i = 0; i < am; i++) { 1581 r1 = r2 = 0.0; 1582 n = a->i[i + 1] - a->i[i]; 1583 aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 1584 aa = PetscSafePointerPlusOffset(av, a->i[i]); 1585 for (j = 0; j < n; j++) { 1586 const PetscScalar aatmp = aa[j]; 1587 const PetscInt ajtmp = aj[j]; 1588 r1 += aatmp * b1[ajtmp]; 1589 r2 += aatmp * b2[ajtmp]; 1590 } 1591 if (add) { 1592 c1[i] += r1; 1593 c2[i] += r2; 1594 } else { 1595 c1[i] = r1; 1596 c2[i] = r2; 1597 } 1598 } 1599 } else { 1600 for (i = 0; i < am; i++) { 1601 r1 = r2 = r3 = 0.0; 1602 n = a->i[i + 1] - a->i[i]; 1603 aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 1604 aa = PetscSafePointerPlusOffset(av, a->i[i]); 1605 for (j = 0; j < n; j++) { 1606 const PetscScalar aatmp = aa[j]; 1607 const PetscInt ajtmp = aj[j]; 1608 r1 += aatmp * b1[ajtmp]; 1609 r2 += aatmp * b2[ajtmp]; 1610 r3 += aatmp * b3[ajtmp]; 1611 } 1612 if (add) { 1613 c1[i] += r1; 1614 c2[i] += r2; 1615 c3[i] += r3; 1616 } else { 1617 c1[i] = r1; 1618 c2[i] = r2; 1619 c3[i] = r3; 1620 } 1621 } 1622 } 1623 } 1624 PetscCall(PetscLogFlops(cn * (2.0 * a->nz))); 1625 if (add) { 1626 PetscCall(MatDenseRestoreArray(C, &c)); 1627 } else { 1628 PetscCall(MatDenseRestoreArrayWrite(C, &c)); 1629 } 1630 PetscCall(MatDenseRestoreArrayRead(B, &b)); 1631 PetscCall(MatSeqAIJRestoreArrayRead(A, &av)); 1632 PetscFunctionReturn(PETSC_SUCCESS); 1633 } 1634 1635 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C) 1636 { 1637 PetscFunctionBegin; 1638 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 1639 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 1640 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1641 1642 PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE)); 1643 PetscFunctionReturn(PETSC_SUCCESS); 1644 } 1645 1646 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C) 1647 { 1648 PetscFunctionBegin; 1649 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense; 1650 C->ops->productsymbolic = MatProductSymbolic_AB; 1651 PetscFunctionReturn(PETSC_SUCCESS); 1652 } 1653 1654 PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat); 1655 1656 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C) 1657 { 1658 PetscFunctionBegin; 1659 C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1660 C->ops->productsymbolic = MatProductSymbolic_AtB; 1661 PetscFunctionReturn(PETSC_SUCCESS); 1662 } 1663 1664 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C) 1665 { 1666 PetscFunctionBegin; 1667 C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1668 C->ops->productsymbolic = MatProductSymbolic_ABt; 1669 PetscFunctionReturn(PETSC_SUCCESS); 1670 } 1671 1672 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C) 1673 { 1674 Mat_Product *product = C->product; 1675 1676 PetscFunctionBegin; 1677 switch (product->type) { 1678 case MATPRODUCT_AB: 1679 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C)); 1680 break; 1681 case MATPRODUCT_AtB: 1682 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C)); 1683 break; 1684 case MATPRODUCT_ABt: 1685 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C)); 1686 break; 1687 default: 1688 break; 1689 } 1690 PetscFunctionReturn(PETSC_SUCCESS); 1691 } 1692 1693 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C) 1694 { 1695 Mat_Product *product = C->product; 1696 Mat A = product->A; 1697 PetscBool baij; 1698 1699 PetscFunctionBegin; 1700 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij)); 1701 if (!baij) { /* A is seqsbaij */ 1702 PetscBool sbaij; 1703 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij)); 1704 PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format"); 1705 1706 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense; 1707 } else { /* A is seqbaij */ 1708 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense; 1709 } 1710 1711 C->ops->productsymbolic = MatProductSymbolic_AB; 1712 PetscFunctionReturn(PETSC_SUCCESS); 1713 } 1714 1715 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C) 1716 { 1717 Mat_Product *product = C->product; 1718 1719 PetscFunctionBegin; 1720 MatCheckProduct(C, 1); 1721 PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A"); 1722 if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C)); 1723 else if (product->type == MATPRODUCT_AtB) { 1724 PetscBool flg; 1725 1726 PetscCall(PetscObjectTypeCompare((PetscObject)product->A, MATSEQBAIJ, &flg)); 1727 if (flg) { 1728 C->ops->transposematmultsymbolic = MatTransposeMatMultSymbolic_SeqBAIJ_SeqDense; 1729 C->ops->productsymbolic = MatProductSymbolic_AtB; 1730 } 1731 } 1732 PetscFunctionReturn(PETSC_SUCCESS); 1733 } 1734 1735 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C) 1736 { 1737 PetscFunctionBegin; 1738 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ; 1739 C->ops->productsymbolic = MatProductSymbolic_AB; 1740 PetscFunctionReturn(PETSC_SUCCESS); 1741 } 1742 1743 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C) 1744 { 1745 Mat_Product *product = C->product; 1746 1747 PetscFunctionBegin; 1748 if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C)); 1749 PetscFunctionReturn(PETSC_SUCCESS); 1750 } 1751 1752 PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense) 1753 { 1754 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1755 Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data; 1756 PetscInt *bi = b->i, *bj = b->j; 1757 PetscInt m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns; 1758 MatScalar *btval, *btval_den, *ba = b->a; 1759 PetscInt *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors; 1760 1761 PetscFunctionBegin; 1762 btval_den = btdense->v; 1763 PetscCall(PetscArrayzero(btval_den, m * n)); 1764 for (k = 0; k < ncolors; k++) { 1765 ncolumns = coloring->ncolumns[k]; 1766 for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */ 1767 col = *(columns + colorforcol[k] + l); 1768 btcol = bj + bi[col]; 1769 btval = ba + bi[col]; 1770 anz = bi[col + 1] - bi[col]; 1771 for (j = 0; j < anz; j++) { 1772 brow = btcol[j]; 1773 btval_den[brow] = btval[j]; 1774 } 1775 } 1776 btval_den += m; 1777 } 1778 PetscFunctionReturn(PETSC_SUCCESS); 1779 } 1780 1781 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp) 1782 { 1783 Mat_SeqAIJ *csp = (Mat_SeqAIJ *)Csp->data; 1784 const PetscScalar *ca_den, *ca_den_ptr; 1785 PetscScalar *ca = csp->a; 1786 PetscInt k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors; 1787 PetscInt brows = matcoloring->brows, *den2sp = matcoloring->den2sp; 1788 PetscInt nrows, *row, *idx; 1789 PetscInt *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow; 1790 1791 PetscFunctionBegin; 1792 PetscCall(MatDenseGetArrayRead(Cden, &ca_den)); 1793 1794 if (brows > 0) { 1795 PetscInt *lstart, row_end, row_start; 1796 lstart = matcoloring->lstart; 1797 PetscCall(PetscArrayzero(lstart, ncolors)); 1798 1799 row_end = brows; 1800 if (row_end > m) row_end = m; 1801 for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */ 1802 ca_den_ptr = ca_den; 1803 for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */ 1804 nrows = matcoloring->nrows[k]; 1805 row = rows + colorforrow[k]; 1806 idx = den2sp + colorforrow[k]; 1807 for (l = lstart[k]; l < nrows; l++) { 1808 if (row[l] >= row_end) { 1809 lstart[k] = l; 1810 break; 1811 } else { 1812 ca[idx[l]] = ca_den_ptr[row[l]]; 1813 } 1814 } 1815 ca_den_ptr += m; 1816 } 1817 row_end += brows; 1818 if (row_end > m) row_end = m; 1819 } 1820 } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */ 1821 ca_den_ptr = ca_den; 1822 for (k = 0; k < ncolors; k++) { 1823 nrows = matcoloring->nrows[k]; 1824 row = rows + colorforrow[k]; 1825 idx = den2sp + colorforrow[k]; 1826 for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]]; 1827 ca_den_ptr += m; 1828 } 1829 } 1830 1831 PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den)); 1832 #if defined(PETSC_USE_INFO) 1833 if (matcoloring->brows > 0) { 1834 PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows)); 1835 } else { 1836 PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n")); 1837 } 1838 #endif 1839 PetscFunctionReturn(PETSC_SUCCESS); 1840 } 1841 1842 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c) 1843 { 1844 PetscInt i, n, nrows, Nbs, j, k, m, ncols, col, cm; 1845 const PetscInt *is, *ci, *cj, *row_idx; 1846 PetscInt nis = iscoloring->n, *rowhit, bs = 1; 1847 IS *isa; 1848 Mat_SeqAIJ *csp = (Mat_SeqAIJ *)mat->data; 1849 PetscInt *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i; 1850 PetscInt *colorforcol, *columns, *columns_i, brows; 1851 PetscBool flg; 1852 1853 PetscFunctionBegin; 1854 PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa)); 1855 1856 /* bs > 1 is not being tested yet! */ 1857 Nbs = mat->cmap->N / bs; 1858 c->M = mat->rmap->N / bs; /* set total rows, columns and local rows */ 1859 c->N = Nbs; 1860 c->m = c->M; 1861 c->rstart = 0; 1862 c->brows = 100; 1863 1864 c->ncolors = nis; 1865 PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow)); 1866 PetscCall(PetscMalloc1(csp->nz + 1, &rows)); 1867 PetscCall(PetscMalloc1(csp->nz + 1, &den2sp)); 1868 1869 brows = c->brows; 1870 PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg)); 1871 if (flg) c->brows = brows; 1872 if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart)); 1873 1874 colorforrow[0] = 0; 1875 rows_i = rows; 1876 den2sp_i = den2sp; 1877 1878 PetscCall(PetscMalloc1(nis + 1, &colorforcol)); 1879 PetscCall(PetscMalloc1(Nbs + 1, &columns)); 1880 1881 colorforcol[0] = 0; 1882 columns_i = columns; 1883 1884 /* get column-wise storage of mat */ 1885 PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 1886 1887 cm = c->m; 1888 PetscCall(PetscMalloc1(cm + 1, &rowhit)); 1889 PetscCall(PetscMalloc1(cm + 1, &idxhit)); 1890 for (i = 0; i < nis; i++) { /* loop over color */ 1891 PetscCall(ISGetLocalSize(isa[i], &n)); 1892 PetscCall(ISGetIndices(isa[i], &is)); 1893 1894 c->ncolumns[i] = n; 1895 if (n) PetscCall(PetscArraycpy(columns_i, is, n)); 1896 colorforcol[i + 1] = colorforcol[i] + n; 1897 columns_i += n; 1898 1899 /* fast, crude version requires O(N*N) work */ 1900 PetscCall(PetscArrayzero(rowhit, cm)); 1901 1902 for (j = 0; j < n; j++) { /* loop over columns*/ 1903 col = is[j]; 1904 row_idx = cj + ci[col]; 1905 m = ci[col + 1] - ci[col]; 1906 for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */ 1907 idxhit[*row_idx] = spidx[ci[col] + k]; 1908 rowhit[*row_idx++] = col + 1; 1909 } 1910 } 1911 /* count the number of hits */ 1912 nrows = 0; 1913 for (j = 0; j < cm; j++) { 1914 if (rowhit[j]) nrows++; 1915 } 1916 c->nrows[i] = nrows; 1917 colorforrow[i + 1] = colorforrow[i] + nrows; 1918 1919 nrows = 0; 1920 for (j = 0; j < cm; j++) { /* loop over rows */ 1921 if (rowhit[j]) { 1922 rows_i[nrows] = j; 1923 den2sp_i[nrows] = idxhit[j]; 1924 nrows++; 1925 } 1926 } 1927 den2sp_i += nrows; 1928 1929 PetscCall(ISRestoreIndices(isa[i], &is)); 1930 rows_i += nrows; 1931 } 1932 PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 1933 PetscCall(PetscFree(rowhit)); 1934 PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa)); 1935 PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]); 1936 1937 c->colorforrow = colorforrow; 1938 c->rows = rows; 1939 c->den2sp = den2sp; 1940 c->colorforcol = colorforcol; 1941 c->columns = columns; 1942 1943 PetscCall(PetscFree(idxhit)); 1944 PetscFunctionReturn(PETSC_SUCCESS); 1945 } 1946 1947 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C) 1948 { 1949 Mat_Product *product = C->product; 1950 Mat A = product->A, B = product->B; 1951 1952 PetscFunctionBegin; 1953 if (C->ops->mattransposemultnumeric) { 1954 /* Alg: "outerproduct" */ 1955 PetscCall((*C->ops->mattransposemultnumeric)(A, B, C)); 1956 } else { 1957 /* Alg: "matmatmult" -- C = At*B */ 1958 MatProductCtx_MatTransMatMult *atb = (MatProductCtx_MatTransMatMult *)product->data; 1959 1960 PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1961 if (atb->At) { 1962 /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(); 1963 user may have called MatProductReplaceMats() to get this A=product->A */ 1964 PetscCall(MatTransposeSetPrecursor(A, atb->At)); 1965 PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At)); 1966 } 1967 PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C)); 1968 } 1969 PetscFunctionReturn(PETSC_SUCCESS); 1970 } 1971 1972 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C) 1973 { 1974 Mat_Product *product = C->product; 1975 Mat A = product->A, B = product->B; 1976 PetscReal fill = product->fill; 1977 1978 PetscFunctionBegin; 1979 PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C)); 1980 1981 C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ; 1982 PetscFunctionReturn(PETSC_SUCCESS); 1983 } 1984 1985 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C) 1986 { 1987 Mat_Product *product = C->product; 1988 PetscInt alg = 0; /* default algorithm */ 1989 PetscBool flg = PETSC_FALSE; 1990 #if !defined(PETSC_HAVE_HYPRE) 1991 const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 1992 PetscInt nalg = 7; 1993 #else 1994 const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"}; 1995 PetscInt nalg = 8; 1996 #endif 1997 1998 PetscFunctionBegin; 1999 /* Set default algorithm */ 2000 PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 2001 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2002 2003 /* Get runtime option */ 2004 if (product->api_user) { 2005 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat"); 2006 PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg)); 2007 PetscOptionsEnd(); 2008 } else { 2009 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat"); 2010 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg)); 2011 PetscOptionsEnd(); 2012 } 2013 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2014 2015 C->ops->productsymbolic = MatProductSymbolic_AB; 2016 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ; 2017 PetscFunctionReturn(PETSC_SUCCESS); 2018 } 2019 2020 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C) 2021 { 2022 Mat_Product *product = C->product; 2023 PetscInt alg = 0; /* default algorithm */ 2024 PetscBool flg = PETSC_FALSE; 2025 const char *algTypes[3] = {"default", "at*b", "outerproduct"}; 2026 PetscInt nalg = 3; 2027 2028 PetscFunctionBegin; 2029 /* Get runtime option */ 2030 if (product->api_user) { 2031 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat"); 2032 PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2033 PetscOptionsEnd(); 2034 } else { 2035 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat"); 2036 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg)); 2037 PetscOptionsEnd(); 2038 } 2039 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2040 2041 C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ; 2042 PetscFunctionReturn(PETSC_SUCCESS); 2043 } 2044 2045 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C) 2046 { 2047 Mat_Product *product = C->product; 2048 PetscInt alg = 0; /* default algorithm */ 2049 PetscBool flg = PETSC_FALSE; 2050 const char *algTypes[2] = {"default", "color"}; 2051 PetscInt nalg = 2; 2052 2053 PetscFunctionBegin; 2054 /* Set default algorithm */ 2055 PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 2056 if (!flg) { 2057 alg = 1; 2058 PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2059 } 2060 2061 /* Get runtime option */ 2062 if (product->api_user) { 2063 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat"); 2064 PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2065 PetscOptionsEnd(); 2066 } else { 2067 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat"); 2068 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg)); 2069 PetscOptionsEnd(); 2070 } 2071 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2072 2073 C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ; 2074 C->ops->productsymbolic = MatProductSymbolic_ABt; 2075 PetscFunctionReturn(PETSC_SUCCESS); 2076 } 2077 2078 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C) 2079 { 2080 Mat_Product *product = C->product; 2081 PetscBool flg = PETSC_FALSE; 2082 PetscInt alg = 0; /* default algorithm -- alg=1 should be default!!! */ 2083 #if !defined(PETSC_HAVE_HYPRE) 2084 const char *algTypes[2] = {"scalable", "rap"}; 2085 PetscInt nalg = 2; 2086 #else 2087 const char *algTypes[3] = {"scalable", "rap", "hypre"}; 2088 PetscInt nalg = 3; 2089 #endif 2090 2091 PetscFunctionBegin; 2092 /* Set default algorithm */ 2093 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2094 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2095 2096 /* Get runtime option */ 2097 if (product->api_user) { 2098 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat"); 2099 PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2100 PetscOptionsEnd(); 2101 } else { 2102 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat"); 2103 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2104 PetscOptionsEnd(); 2105 } 2106 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2107 2108 C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ; 2109 PetscFunctionReturn(PETSC_SUCCESS); 2110 } 2111 2112 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C) 2113 { 2114 Mat_Product *product = C->product; 2115 PetscBool flg = PETSC_FALSE; 2116 PetscInt alg = 0; /* default algorithm */ 2117 const char *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"}; 2118 PetscInt nalg = 3; 2119 2120 PetscFunctionBegin; 2121 /* Set default algorithm */ 2122 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2123 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2124 2125 /* Get runtime option */ 2126 if (product->api_user) { 2127 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat"); 2128 PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2129 PetscOptionsEnd(); 2130 } else { 2131 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat"); 2132 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2133 PetscOptionsEnd(); 2134 } 2135 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2136 2137 C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ; 2138 PetscFunctionReturn(PETSC_SUCCESS); 2139 } 2140 2141 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */ 2142 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C) 2143 { 2144 Mat_Product *product = C->product; 2145 PetscInt alg = 0; /* default algorithm */ 2146 PetscBool flg = PETSC_FALSE; 2147 const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 2148 PetscInt nalg = 7; 2149 2150 PetscFunctionBegin; 2151 /* Set default algorithm */ 2152 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2153 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2154 2155 /* Get runtime option */ 2156 if (product->api_user) { 2157 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat"); 2158 PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2159 PetscOptionsEnd(); 2160 } else { 2161 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat"); 2162 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg)); 2163 PetscOptionsEnd(); 2164 } 2165 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2166 2167 C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ; 2168 C->ops->productsymbolic = MatProductSymbolic_ABC; 2169 PetscFunctionReturn(PETSC_SUCCESS); 2170 } 2171 2172 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C) 2173 { 2174 Mat_Product *product = C->product; 2175 2176 PetscFunctionBegin; 2177 switch (product->type) { 2178 case MATPRODUCT_AB: 2179 PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C)); 2180 break; 2181 case MATPRODUCT_AtB: 2182 PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C)); 2183 break; 2184 case MATPRODUCT_ABt: 2185 PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C)); 2186 break; 2187 case MATPRODUCT_PtAP: 2188 PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C)); 2189 break; 2190 case MATPRODUCT_RARt: 2191 PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C)); 2192 break; 2193 case MATPRODUCT_ABC: 2194 PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C)); 2195 break; 2196 default: 2197 break; 2198 } 2199 PetscFunctionReturn(PETSC_SUCCESS); 2200 } 2201