1 /* 2 Defines matrix-matrix product routines for pairs of SeqAIJ matrices 3 C = A * B 4 */ 5 6 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 7 #include <../src/mat/utils/freespace.h> 8 #include <petscbt.h> 9 #include <petsc/private/isimpl.h> 10 #include <../src/mat/impls/dense/seq/dense.h> 11 12 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 13 { 14 PetscFunctionBegin; 15 if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C)); 16 else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C)); 17 PetscFunctionReturn(PETSC_SUCCESS); 18 } 19 20 /* Modified from MatCreateSeqAIJWithArrays() */ 21 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat) 22 { 23 PetscInt ii; 24 Mat_SeqAIJ *aij; 25 PetscBool isseqaij, ofree_a, ofree_ij; 26 27 PetscFunctionBegin; 28 PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0"); 29 PetscCall(MatSetSizes(mat, m, n, m, n)); 30 31 if (!mtype) { 32 PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij)); 33 if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ)); 34 } else { 35 PetscCall(MatSetType(mat, mtype)); 36 } 37 38 aij = (Mat_SeqAIJ *)(mat)->data; 39 ofree_a = aij->free_a; 40 ofree_ij = aij->free_ij; 41 /* changes the free flags */ 42 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL)); 43 44 PetscCall(PetscFree(aij->ilen)); 45 PetscCall(PetscFree(aij->imax)); 46 PetscCall(PetscMalloc1(m, &aij->imax)); 47 PetscCall(PetscMalloc1(m, &aij->ilen)); 48 for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) { 49 const PetscInt rnz = i[ii + 1] - i[ii]; 50 aij->nonzerorowcnt += !!rnz; 51 aij->rmax = PetscMax(aij->rmax, rnz); 52 aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii]; 53 } 54 aij->maxnz = i[m]; 55 aij->nz = i[m]; 56 57 if (ofree_a) PetscCall(PetscShmgetDeallocateArray((void **)&aij->a)); 58 if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->j)); 59 if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->i)); 60 61 aij->i = i; 62 aij->j = j; 63 aij->a = a; 64 aij->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */ 65 aij->free_a = PETSC_FALSE; 66 aij->free_ij = PETSC_FALSE; 67 PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6)); 68 // Always build the diag info when i, j are set 69 PetscCall(MatMarkDiagonal_SeqAIJ(mat)); 70 PetscFunctionReturn(PETSC_SUCCESS); 71 } 72 73 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 74 { 75 Mat_Product *product = C->product; 76 MatProductAlgorithm alg; 77 PetscBool flg; 78 79 PetscFunctionBegin; 80 if (product) { 81 alg = product->alg; 82 } else { 83 alg = "sorted"; 84 } 85 /* sorted */ 86 PetscCall(PetscStrcmp(alg, "sorted", &flg)); 87 if (flg) { 88 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C)); 89 PetscFunctionReturn(PETSC_SUCCESS); 90 } 91 92 /* scalable */ 93 PetscCall(PetscStrcmp(alg, "scalable", &flg)); 94 if (flg) { 95 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C)); 96 PetscFunctionReturn(PETSC_SUCCESS); 97 } 98 99 /* scalable_fast */ 100 PetscCall(PetscStrcmp(alg, "scalable_fast", &flg)); 101 if (flg) { 102 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C)); 103 PetscFunctionReturn(PETSC_SUCCESS); 104 } 105 106 /* heap */ 107 PetscCall(PetscStrcmp(alg, "heap", &flg)); 108 if (flg) { 109 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C)); 110 PetscFunctionReturn(PETSC_SUCCESS); 111 } 112 113 /* btheap */ 114 PetscCall(PetscStrcmp(alg, "btheap", &flg)); 115 if (flg) { 116 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C)); 117 PetscFunctionReturn(PETSC_SUCCESS); 118 } 119 120 /* llcondensed */ 121 PetscCall(PetscStrcmp(alg, "llcondensed", &flg)); 122 if (flg) { 123 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C)); 124 PetscFunctionReturn(PETSC_SUCCESS); 125 } 126 127 /* rowmerge */ 128 PetscCall(PetscStrcmp(alg, "rowmerge", &flg)); 129 if (flg) { 130 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C)); 131 PetscFunctionReturn(PETSC_SUCCESS); 132 } 133 134 #if defined(PETSC_HAVE_HYPRE) 135 PetscCall(PetscStrcmp(alg, "hypre", &flg)); 136 if (flg) { 137 PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C)); 138 PetscFunctionReturn(PETSC_SUCCESS); 139 } 140 #endif 141 142 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 143 } 144 145 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C) 146 { 147 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 148 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 149 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 150 PetscReal afill; 151 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 152 PetscHMapI ta; 153 PetscBT lnkbt; 154 PetscFreeSpaceList free_space = NULL, current_space = NULL; 155 156 PetscFunctionBegin; 157 /* Get ci and cj */ 158 /* Allocate ci array, arrays for fill computation and */ 159 /* free space for accumulating nonzero column info */ 160 PetscCall(PetscMalloc1(am + 2, &ci)); 161 ci[0] = 0; 162 163 /* create and initialize a linked list */ 164 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 165 MatRowMergeMax_SeqAIJ(b, bm, ta); 166 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 167 PetscCall(PetscHMapIDestroy(&ta)); 168 169 PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt)); 170 171 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 172 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 173 174 current_space = free_space; 175 176 /* Determine ci and cj */ 177 for (i = 0; i < am; i++) { 178 anzi = ai[i + 1] - ai[i]; 179 aj = a->j + ai[i]; 180 for (j = 0; j < anzi; j++) { 181 brow = aj[j]; 182 bnzj = bi[brow + 1] - bi[brow]; 183 bj = b->j + bi[brow]; 184 /* add non-zero cols of B into the sorted linked list lnk */ 185 PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt)); 186 } 187 /* add possible missing diagonal entry */ 188 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt)); 189 cnzi = lnk[0]; 190 191 /* If free space is not available, make more free space */ 192 /* Double the amount of total space in the list */ 193 if (current_space->local_remaining < cnzi) { 194 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 195 ndouble++; 196 } 197 198 /* Copy data into free space, then initialize lnk */ 199 PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt)); 200 201 current_space->array += cnzi; 202 current_space->local_used += cnzi; 203 current_space->local_remaining -= cnzi; 204 205 ci[i + 1] = ci[i] + cnzi; 206 } 207 208 /* Column indices are in the list of free space */ 209 /* Allocate space for cj, initialize cj, and */ 210 /* destroy list of free space and other temporary array(s) */ 211 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 212 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 213 PetscCall(PetscLLCondensedDestroy(lnk, lnkbt)); 214 215 /* put together the new symbolic matrix */ 216 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 217 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 218 219 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 220 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 221 c = (Mat_SeqAIJ *)C->data; 222 c->free_a = PETSC_FALSE; 223 c->free_ij = PETSC_TRUE; 224 c->nonew = 0; 225 226 /* fast, needs non-scalable O(bn) array 'abdense' */ 227 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 228 229 /* set MatInfo */ 230 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 231 if (afill < 1.0) afill = 1.0; 232 C->info.mallocs = ndouble; 233 C->info.fill_ratio_given = fill; 234 C->info.fill_ratio_needed = afill; 235 236 #if defined(PETSC_USE_INFO) 237 if (ci[am]) { 238 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 239 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 240 } else { 241 PetscCall(PetscInfo(C, "Empty matrix product\n")); 242 } 243 #endif 244 PetscFunctionReturn(PETSC_SUCCESS); 245 } 246 247 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C) 248 { 249 PetscLogDouble flops = 0.0; 250 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 251 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 252 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 253 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 254 PetscInt am = A->rmap->n, cm = C->rmap->n; 255 PetscInt i, j, k, anzi, bnzi, cnzi, brow; 256 PetscScalar *ca, valtmp; 257 PetscScalar *ab_dense; 258 PetscContainer cab_dense; 259 const PetscScalar *aa, *ba, *baj; 260 261 PetscFunctionBegin; 262 PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 263 PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 264 if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 265 PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 266 c->a = ca; 267 c->free_a = PETSC_TRUE; 268 } else ca = c->a; 269 270 /* TODO this should be done in the symbolic phase */ 271 /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers 272 that is hard to eradicate) */ 273 PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense)); 274 if (!cab_dense) { 275 PetscCall(PetscMalloc1(B->cmap->N, &ab_dense)); 276 PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &cab_dense)); 277 PetscCall(PetscContainerSetPointer(cab_dense, ab_dense)); 278 PetscCall(PetscContainerSetUserDestroy(cab_dense, PetscContainerUserDestroyDefault)); 279 PetscCall(PetscObjectCompose((PetscObject)C, "__PETSc__ab_dense", (PetscObject)cab_dense)); 280 PetscCall(PetscObjectDereference((PetscObject)cab_dense)); 281 } 282 PetscCall(PetscContainerGetPointer(cab_dense, (void **)&ab_dense)); 283 PetscCall(PetscArrayzero(ab_dense, B->cmap->N)); 284 285 /* clean old values in C */ 286 PetscCall(PetscArrayzero(ca, ci[cm])); 287 /* Traverse A row-wise. */ 288 /* Build the ith row in C by summing over nonzero columns in A, */ 289 /* the rows of B corresponding to nonzeros of A. */ 290 for (i = 0; i < am; i++) { 291 anzi = ai[i + 1] - ai[i]; 292 for (j = 0; j < anzi; j++) { 293 brow = aj[j]; 294 bnzi = bi[brow + 1] - bi[brow]; 295 bjj = PetscSafePointerPlusOffset(bj, bi[brow]); 296 baj = PetscSafePointerPlusOffset(ba, bi[brow]); 297 /* perform dense axpy */ 298 valtmp = aa[j]; 299 for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k]; 300 flops += 2 * bnzi; 301 } 302 aj = PetscSafePointerPlusOffset(aj, anzi); 303 aa = PetscSafePointerPlusOffset(aa, anzi); 304 305 cnzi = ci[i + 1] - ci[i]; 306 for (k = 0; k < cnzi; k++) { 307 ca[k] += ab_dense[cj[k]]; 308 ab_dense[cj[k]] = 0.0; /* zero ab_dense */ 309 } 310 flops += cnzi; 311 cj = PetscSafePointerPlusOffset(cj, cnzi); 312 ca += cnzi; 313 } 314 #if defined(PETSC_HAVE_DEVICE) 315 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 316 #endif 317 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 318 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 319 PetscCall(PetscLogFlops(flops)); 320 PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 321 PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 322 PetscFunctionReturn(PETSC_SUCCESS); 323 } 324 325 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C) 326 { 327 PetscLogDouble flops = 0.0; 328 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 329 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 330 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 331 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 332 PetscInt am = A->rmap->N, cm = C->rmap->N; 333 PetscInt i, j, k, anzi, bnzi, cnzi, brow; 334 PetscScalar *ca = c->a, valtmp; 335 const PetscScalar *aa, *ba, *baj; 336 PetscInt nextb; 337 338 PetscFunctionBegin; 339 PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 340 PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 341 if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 342 PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 343 c->a = ca; 344 c->free_a = PETSC_TRUE; 345 } 346 347 /* clean old values in C */ 348 PetscCall(PetscArrayzero(ca, ci[cm])); 349 /* Traverse A row-wise. */ 350 /* Build the ith row in C by summing over nonzero columns in A, */ 351 /* the rows of B corresponding to nonzeros of A. */ 352 for (i = 0; i < am; i++) { 353 anzi = ai[i + 1] - ai[i]; 354 cnzi = ci[i + 1] - ci[i]; 355 for (j = 0; j < anzi; j++) { 356 brow = aj[j]; 357 bnzi = bi[brow + 1] - bi[brow]; 358 bjj = bj + bi[brow]; 359 baj = ba + bi[brow]; 360 /* perform sparse axpy */ 361 valtmp = aa[j]; 362 nextb = 0; 363 for (k = 0; nextb < bnzi; k++) { 364 if (cj[k] == bjj[nextb]) { /* ccol == bcol */ 365 ca[k] += valtmp * baj[nextb++]; 366 } 367 } 368 flops += 2 * bnzi; 369 } 370 aj += anzi; 371 aa += anzi; 372 cj += cnzi; 373 ca += cnzi; 374 } 375 #if defined(PETSC_HAVE_DEVICE) 376 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 377 #endif 378 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 379 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 380 PetscCall(PetscLogFlops(flops)); 381 PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 382 PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 383 PetscFunctionReturn(PETSC_SUCCESS); 384 } 385 386 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C) 387 { 388 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 389 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 390 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 391 MatScalar *ca; 392 PetscReal afill; 393 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 394 PetscHMapI ta; 395 PetscFreeSpaceList free_space = NULL, current_space = NULL; 396 397 PetscFunctionBegin; 398 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */ 399 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 400 PetscCall(PetscMalloc1(am + 2, &ci)); 401 ci[0] = 0; 402 403 /* create and initialize a linked list */ 404 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 405 MatRowMergeMax_SeqAIJ(b, bm, ta); 406 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 407 PetscCall(PetscHMapIDestroy(&ta)); 408 409 PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk)); 410 411 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 412 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 413 current_space = free_space; 414 415 /* Determine ci and cj */ 416 for (i = 0; i < am; i++) { 417 anzi = ai[i + 1] - ai[i]; 418 aj = a->j + ai[i]; 419 for (j = 0; j < anzi; j++) { 420 brow = aj[j]; 421 bnzj = bi[brow + 1] - bi[brow]; 422 bj = b->j + bi[brow]; 423 /* add non-zero cols of B into the sorted linked list lnk */ 424 PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk)); 425 } 426 /* add possible missing diagonal entry */ 427 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk)); 428 cnzi = lnk[1]; 429 430 /* If free space is not available, make more free space */ 431 /* Double the amount of total space in the list */ 432 if (current_space->local_remaining < cnzi) { 433 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 434 ndouble++; 435 } 436 437 /* Copy data into free space, then initialize lnk */ 438 PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk)); 439 440 current_space->array += cnzi; 441 current_space->local_used += cnzi; 442 current_space->local_remaining -= cnzi; 443 444 ci[i + 1] = ci[i] + cnzi; 445 } 446 447 /* Column indices are in the list of free space */ 448 /* Allocate space for cj, initialize cj, and */ 449 /* destroy list of free space and other temporary array(s) */ 450 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 451 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 452 PetscCall(PetscLLCondensedDestroy_fast(lnk)); 453 454 /* Allocate space for ca */ 455 PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 456 457 /* put together the new symbolic matrix */ 458 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 459 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 460 461 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 462 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 463 c = (Mat_SeqAIJ *)C->data; 464 c->free_a = PETSC_TRUE; 465 c->free_ij = PETSC_TRUE; 466 c->nonew = 0; 467 468 /* slower, less memory */ 469 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 470 471 /* set MatInfo */ 472 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 473 if (afill < 1.0) afill = 1.0; 474 C->info.mallocs = ndouble; 475 C->info.fill_ratio_given = fill; 476 C->info.fill_ratio_needed = afill; 477 478 #if defined(PETSC_USE_INFO) 479 if (ci[am]) { 480 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 481 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 482 } else { 483 PetscCall(PetscInfo(C, "Empty matrix product\n")); 484 } 485 #endif 486 PetscFunctionReturn(PETSC_SUCCESS); 487 } 488 489 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C) 490 { 491 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 492 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 493 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 494 MatScalar *ca; 495 PetscReal afill; 496 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 497 PetscHMapI ta; 498 PetscFreeSpaceList free_space = NULL, current_space = NULL; 499 500 PetscFunctionBegin; 501 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */ 502 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 503 PetscCall(PetscMalloc1(am + 2, &ci)); 504 ci[0] = 0; 505 506 /* create and initialize a linked list */ 507 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 508 MatRowMergeMax_SeqAIJ(b, bm, ta); 509 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 510 PetscCall(PetscHMapIDestroy(&ta)); 511 PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk)); 512 513 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 514 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 515 current_space = free_space; 516 517 /* Determine ci and cj */ 518 for (i = 0; i < am; i++) { 519 anzi = ai[i + 1] - ai[i]; 520 aj = a->j + ai[i]; 521 for (j = 0; j < anzi; j++) { 522 brow = aj[j]; 523 bnzj = bi[brow + 1] - bi[brow]; 524 bj = b->j + bi[brow]; 525 /* add non-zero cols of B into the sorted linked list lnk */ 526 PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk)); 527 } 528 /* add possible missing diagonal entry */ 529 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk)); 530 531 cnzi = lnk[0]; 532 533 /* If free space is not available, make more free space */ 534 /* Double the amount of total space in the list */ 535 if (current_space->local_remaining < cnzi) { 536 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 537 ndouble++; 538 } 539 540 /* Copy data into free space, then initialize lnk */ 541 PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk)); 542 543 current_space->array += cnzi; 544 current_space->local_used += cnzi; 545 current_space->local_remaining -= cnzi; 546 547 ci[i + 1] = ci[i] + cnzi; 548 } 549 550 /* Column indices are in the list of free space */ 551 /* Allocate space for cj, initialize cj, and */ 552 /* destroy list of free space and other temporary array(s) */ 553 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 554 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 555 PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 556 557 /* Allocate space for ca */ 558 PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 559 560 /* put together the new symbolic matrix */ 561 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 562 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 563 564 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 565 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 566 c = (Mat_SeqAIJ *)C->data; 567 c->free_a = PETSC_TRUE; 568 c->free_ij = PETSC_TRUE; 569 c->nonew = 0; 570 571 /* slower, less memory */ 572 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 573 574 /* set MatInfo */ 575 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 576 if (afill < 1.0) afill = 1.0; 577 C->info.mallocs = ndouble; 578 C->info.fill_ratio_given = fill; 579 C->info.fill_ratio_needed = afill; 580 581 #if defined(PETSC_USE_INFO) 582 if (ci[am]) { 583 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 584 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 585 } else { 586 PetscCall(PetscInfo(C, "Empty matrix product\n")); 587 } 588 #endif 589 PetscFunctionReturn(PETSC_SUCCESS); 590 } 591 592 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C) 593 { 594 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 595 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 596 PetscInt *ci, *cj, *bb; 597 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 598 PetscReal afill; 599 PetscInt i, j, col, ndouble = 0; 600 PetscFreeSpaceList free_space = NULL, current_space = NULL; 601 PetscHeap h; 602 603 PetscFunctionBegin; 604 /* Get ci and cj - by merging sorted rows using a heap */ 605 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 606 PetscCall(PetscMalloc1(am + 2, &ci)); 607 ci[0] = 0; 608 609 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 610 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 611 current_space = free_space; 612 613 PetscCall(PetscHeapCreate(a->rmax, &h)); 614 PetscCall(PetscMalloc1(a->rmax, &bb)); 615 616 /* Determine ci and cj */ 617 for (i = 0; i < am; i++) { 618 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 619 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 620 ci[i + 1] = ci[i]; 621 /* Populate the min heap */ 622 for (j = 0; j < anzi; j++) { 623 bb[j] = bi[acol[j]]; /* bb points at the start of the row */ 624 if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */ 625 PetscCall(PetscHeapAdd(h, j, bj[bb[j]++])); 626 } 627 } 628 /* Pick off the min element, adding it to free space */ 629 PetscCall(PetscHeapPop(h, &j, &col)); 630 while (j >= 0) { 631 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 632 PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 633 ndouble++; 634 } 635 *(current_space->array++) = col; 636 current_space->local_used++; 637 current_space->local_remaining--; 638 ci[i + 1]++; 639 640 /* stash if anything else remains in this row of B */ 641 if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++])); 642 while (1) { /* pop and stash any other rows of B that also had an entry in this column */ 643 PetscInt j2, col2; 644 PetscCall(PetscHeapPeek(h, &j2, &col2)); 645 if (col2 != col) break; 646 PetscCall(PetscHeapPop(h, &j2, &col2)); 647 if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++])); 648 } 649 /* Put any stashed elements back into the min heap */ 650 PetscCall(PetscHeapUnstash(h)); 651 PetscCall(PetscHeapPop(h, &j, &col)); 652 } 653 } 654 PetscCall(PetscFree(bb)); 655 PetscCall(PetscHeapDestroy(&h)); 656 657 /* Column indices are in the list of free space */ 658 /* Allocate space for cj, initialize cj, and */ 659 /* destroy list of free space and other temporary array(s) */ 660 PetscCall(PetscMalloc1(ci[am], &cj)); 661 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 662 663 /* put together the new symbolic matrix */ 664 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 665 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 666 667 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 668 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 669 c = (Mat_SeqAIJ *)C->data; 670 c->free_a = PETSC_TRUE; 671 c->free_ij = PETSC_TRUE; 672 c->nonew = 0; 673 674 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 675 676 /* set MatInfo */ 677 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 678 if (afill < 1.0) afill = 1.0; 679 C->info.mallocs = ndouble; 680 C->info.fill_ratio_given = fill; 681 C->info.fill_ratio_needed = afill; 682 683 #if defined(PETSC_USE_INFO) 684 if (ci[am]) { 685 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 686 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 687 } else { 688 PetscCall(PetscInfo(C, "Empty matrix product\n")); 689 } 690 #endif 691 PetscFunctionReturn(PETSC_SUCCESS); 692 } 693 694 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C) 695 { 696 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 697 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 698 PetscInt *ci, *cj, *bb; 699 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 700 PetscReal afill; 701 PetscInt i, j, col, ndouble = 0; 702 PetscFreeSpaceList free_space = NULL, current_space = NULL; 703 PetscHeap h; 704 PetscBT bt; 705 706 PetscFunctionBegin; 707 /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */ 708 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 709 PetscCall(PetscMalloc1(am + 2, &ci)); 710 ci[0] = 0; 711 712 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 713 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 714 715 current_space = free_space; 716 717 PetscCall(PetscHeapCreate(a->rmax, &h)); 718 PetscCall(PetscMalloc1(a->rmax, &bb)); 719 PetscCall(PetscBTCreate(bn, &bt)); 720 721 /* Determine ci and cj */ 722 for (i = 0; i < am; i++) { 723 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 724 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 725 const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */ 726 ci[i + 1] = ci[i]; 727 /* Populate the min heap */ 728 for (j = 0; j < anzi; j++) { 729 PetscInt brow = acol[j]; 730 for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) { 731 PetscInt bcol = bj[bb[j]]; 732 if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 733 PetscCall(PetscHeapAdd(h, j, bcol)); 734 bb[j]++; 735 break; 736 } 737 } 738 } 739 /* Pick off the min element, adding it to free space */ 740 PetscCall(PetscHeapPop(h, &j, &col)); 741 while (j >= 0) { 742 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 743 fptr = NULL; /* need PetscBTMemzero */ 744 PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 745 ndouble++; 746 } 747 *(current_space->array++) = col; 748 current_space->local_used++; 749 current_space->local_remaining--; 750 ci[i + 1]++; 751 752 /* stash if anything else remains in this row of B */ 753 for (; bb[j] < bi[acol[j] + 1]; bb[j]++) { 754 PetscInt bcol = bj[bb[j]]; 755 if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 756 PetscCall(PetscHeapAdd(h, j, bcol)); 757 bb[j]++; 758 break; 759 } 760 } 761 PetscCall(PetscHeapPop(h, &j, &col)); 762 } 763 if (fptr) { /* Clear the bits for this row */ 764 for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr)); 765 } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */ 766 PetscCall(PetscBTMemzero(bn, bt)); 767 } 768 } 769 PetscCall(PetscFree(bb)); 770 PetscCall(PetscHeapDestroy(&h)); 771 PetscCall(PetscBTDestroy(&bt)); 772 773 /* Column indices are in the list of free space */ 774 /* Allocate space for cj, initialize cj, and */ 775 /* destroy list of free space and other temporary array(s) */ 776 PetscCall(PetscMalloc1(ci[am], &cj)); 777 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 778 779 /* put together the new symbolic matrix */ 780 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 781 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 782 783 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 784 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 785 c = (Mat_SeqAIJ *)C->data; 786 c->free_a = PETSC_TRUE; 787 c->free_ij = PETSC_TRUE; 788 c->nonew = 0; 789 790 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 791 792 /* set MatInfo */ 793 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 794 if (afill < 1.0) afill = 1.0; 795 C->info.mallocs = ndouble; 796 C->info.fill_ratio_given = fill; 797 C->info.fill_ratio_needed = afill; 798 799 #if defined(PETSC_USE_INFO) 800 if (ci[am]) { 801 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 802 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 803 } else { 804 PetscCall(PetscInfo(C, "Empty matrix product\n")); 805 } 806 #endif 807 PetscFunctionReturn(PETSC_SUCCESS); 808 } 809 810 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C) 811 { 812 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 813 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1; 814 PetscInt *ci, *cj, *outputj, worki_L1[9], worki_L2[9]; 815 PetscInt c_maxmem, a_maxrownnz = 0, a_rownnz; 816 const PetscInt workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7}; 817 const PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 818 const PetscInt *brow_ptr[8], *brow_end[8]; 819 PetscInt window[8]; 820 PetscInt window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows; 821 PetscInt i, k, ndouble = 0, L1_rowsleft, rowsleft; 822 PetscReal afill; 823 PetscInt *workj_L1, *workj_L2, *workj_L3; 824 PetscInt L1_nnz, L2_nnz; 825 826 /* Step 1: Get upper bound on memory required for allocation. 827 Because of the way virtual memory works, 828 only the memory pages that are actually needed will be physically allocated. */ 829 PetscFunctionBegin; 830 PetscCall(PetscMalloc1(am + 1, &ci)); 831 for (i = 0; i < am; i++) { 832 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 833 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 834 a_rownnz = 0; 835 for (k = 0; k < anzi; ++k) { 836 a_rownnz += bi[acol[k] + 1] - bi[acol[k]]; 837 if (a_rownnz > bn) { 838 a_rownnz = bn; 839 break; 840 } 841 } 842 a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz); 843 } 844 /* temporary work areas for merging rows */ 845 PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1)); 846 PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2)); 847 PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3)); 848 849 /* This should be enough for almost all matrices. If not, memory is reallocated later. */ 850 c_maxmem = 8 * (ai[am] + bi[bm]); 851 /* Step 2: Populate pattern for C */ 852 PetscCall(PetscMalloc1(c_maxmem, &cj)); 853 854 ci_nnz = 0; 855 ci[0] = 0; 856 worki_L1[0] = 0; 857 worki_L2[0] = 0; 858 for (i = 0; i < am; i++) { 859 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 860 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 861 rowsleft = anzi; 862 inputcol_L1 = acol; 863 L2_nnz = 0; 864 L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */ 865 worki_L2[1] = 0; 866 outputi_nnz = 0; 867 868 /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */ 869 while (ci_nnz + a_maxrownnz > c_maxmem) { 870 c_maxmem *= 2; 871 ndouble++; 872 PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj)); 873 } 874 875 while (rowsleft) { 876 L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */ 877 L1_nrows = 0; 878 L1_nnz = 0; 879 inputcol = inputcol_L1; 880 inputi = bi; 881 inputj = bj; 882 883 /* The following macro is used to specialize for small rows in A. 884 This helps with compiler unrolling, improving performance substantially. 885 Input: inputj inputi inputcol bn 886 Output: outputj outputi_nnz */ 887 #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \ 888 do { \ 889 window_min = bn; \ 890 outputi_nnz = 0; \ 891 for (k = 0; k < ANNZ; ++k) { \ 892 brow_ptr[k] = inputj + inputi[inputcol[k]]; \ 893 brow_end[k] = inputj + inputi[inputcol[k] + 1]; \ 894 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 895 window_min = PetscMin(window[k], window_min); \ 896 } \ 897 while (window_min < bn) { \ 898 outputj[outputi_nnz++] = window_min; \ 899 /* advance front and compute new minimum */ \ 900 old_window_min = window_min; \ 901 window_min = bn; \ 902 for (k = 0; k < ANNZ; ++k) { \ 903 if (window[k] == old_window_min) { \ 904 brow_ptr[k]++; \ 905 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 906 } \ 907 window_min = PetscMin(window[k], window_min); \ 908 } \ 909 } \ 910 } while (0) 911 912 /************** L E V E L 1 ***************/ 913 /* Merge up to 8 rows of B to L1 work array*/ 914 while (L1_rowsleft) { 915 outputi_nnz = 0; 916 if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/ 917 else outputj = cj + ci_nnz; /* Merge directly to C */ 918 919 switch (L1_rowsleft) { 920 case 1: 921 brow_ptr[0] = inputj + inputi[inputcol[0]]; 922 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 923 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 924 inputcol += L1_rowsleft; 925 rowsleft -= L1_rowsleft; 926 L1_rowsleft = 0; 927 break; 928 case 2: 929 MatMatMultSymbolic_RowMergeMacro(2); 930 inputcol += L1_rowsleft; 931 rowsleft -= L1_rowsleft; 932 L1_rowsleft = 0; 933 break; 934 case 3: 935 MatMatMultSymbolic_RowMergeMacro(3); 936 inputcol += L1_rowsleft; 937 rowsleft -= L1_rowsleft; 938 L1_rowsleft = 0; 939 break; 940 case 4: 941 MatMatMultSymbolic_RowMergeMacro(4); 942 inputcol += L1_rowsleft; 943 rowsleft -= L1_rowsleft; 944 L1_rowsleft = 0; 945 break; 946 case 5: 947 MatMatMultSymbolic_RowMergeMacro(5); 948 inputcol += L1_rowsleft; 949 rowsleft -= L1_rowsleft; 950 L1_rowsleft = 0; 951 break; 952 case 6: 953 MatMatMultSymbolic_RowMergeMacro(6); 954 inputcol += L1_rowsleft; 955 rowsleft -= L1_rowsleft; 956 L1_rowsleft = 0; 957 break; 958 case 7: 959 MatMatMultSymbolic_RowMergeMacro(7); 960 inputcol += L1_rowsleft; 961 rowsleft -= L1_rowsleft; 962 L1_rowsleft = 0; 963 break; 964 default: 965 MatMatMultSymbolic_RowMergeMacro(8); 966 inputcol += 8; 967 rowsleft -= 8; 968 L1_rowsleft -= 8; 969 break; 970 } 971 inputcol_L1 = inputcol; 972 L1_nnz += outputi_nnz; 973 worki_L1[++L1_nrows] = L1_nnz; 974 } 975 976 /********************** L E V E L 2 ************************/ 977 /* Merge from L1 work array to either C or to L2 work array */ 978 if (anzi > 8) { 979 inputi = worki_L1; 980 inputj = workj_L1; 981 inputcol = workcol; 982 outputi_nnz = 0; 983 984 if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */ 985 else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */ 986 987 switch (L1_nrows) { 988 case 1: 989 brow_ptr[0] = inputj + inputi[inputcol[0]]; 990 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 991 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 992 break; 993 case 2: 994 MatMatMultSymbolic_RowMergeMacro(2); 995 break; 996 case 3: 997 MatMatMultSymbolic_RowMergeMacro(3); 998 break; 999 case 4: 1000 MatMatMultSymbolic_RowMergeMacro(4); 1001 break; 1002 case 5: 1003 MatMatMultSymbolic_RowMergeMacro(5); 1004 break; 1005 case 6: 1006 MatMatMultSymbolic_RowMergeMacro(6); 1007 break; 1008 case 7: 1009 MatMatMultSymbolic_RowMergeMacro(7); 1010 break; 1011 case 8: 1012 MatMatMultSymbolic_RowMergeMacro(8); 1013 break; 1014 default: 1015 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!"); 1016 } 1017 L2_nnz += outputi_nnz; 1018 worki_L2[++L2_nrows] = L2_nnz; 1019 1020 /************************ L E V E L 3 **********************/ 1021 /* Merge from L2 work array to either C or to L2 work array */ 1022 if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) { 1023 inputi = worki_L2; 1024 inputj = workj_L2; 1025 inputcol = workcol; 1026 outputi_nnz = 0; 1027 if (rowsleft) outputj = workj_L3; 1028 else outputj = cj + ci_nnz; 1029 switch (L2_nrows) { 1030 case 1: 1031 brow_ptr[0] = inputj + inputi[inputcol[0]]; 1032 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 1033 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1034 break; 1035 case 2: 1036 MatMatMultSymbolic_RowMergeMacro(2); 1037 break; 1038 case 3: 1039 MatMatMultSymbolic_RowMergeMacro(3); 1040 break; 1041 case 4: 1042 MatMatMultSymbolic_RowMergeMacro(4); 1043 break; 1044 case 5: 1045 MatMatMultSymbolic_RowMergeMacro(5); 1046 break; 1047 case 6: 1048 MatMatMultSymbolic_RowMergeMacro(6); 1049 break; 1050 case 7: 1051 MatMatMultSymbolic_RowMergeMacro(7); 1052 break; 1053 case 8: 1054 MatMatMultSymbolic_RowMergeMacro(8); 1055 break; 1056 default: 1057 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!"); 1058 } 1059 L2_nrows = 1; 1060 L2_nnz = outputi_nnz; 1061 worki_L2[1] = outputi_nnz; 1062 /* Copy to workj_L2 */ 1063 if (rowsleft) { 1064 for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k]; 1065 } 1066 } 1067 } 1068 } /* while (rowsleft) */ 1069 #undef MatMatMultSymbolic_RowMergeMacro 1070 1071 /* terminate current row */ 1072 ci_nnz += outputi_nnz; 1073 ci[i + 1] = ci_nnz; 1074 } 1075 1076 /* Step 3: Create the new symbolic matrix */ 1077 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 1078 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1079 1080 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1081 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1082 c = (Mat_SeqAIJ *)C->data; 1083 c->free_a = PETSC_TRUE; 1084 c->free_ij = PETSC_TRUE; 1085 c->nonew = 0; 1086 1087 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1088 1089 /* set MatInfo */ 1090 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 1091 if (afill < 1.0) afill = 1.0; 1092 C->info.mallocs = ndouble; 1093 C->info.fill_ratio_given = fill; 1094 C->info.fill_ratio_needed = afill; 1095 1096 #if defined(PETSC_USE_INFO) 1097 if (ci[am]) { 1098 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 1099 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1100 } else { 1101 PetscCall(PetscInfo(C, "Empty matrix product\n")); 1102 } 1103 #endif 1104 1105 /* Step 4: Free temporary work areas */ 1106 PetscCall(PetscFree(workj_L1)); 1107 PetscCall(PetscFree(workj_L2)); 1108 PetscCall(PetscFree(workj_L3)); 1109 PetscFunctionReturn(PETSC_SUCCESS); 1110 } 1111 1112 /* concatenate unique entries and then sort */ 1113 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C) 1114 { 1115 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 1116 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 1117 PetscInt *ci, *cj, bcol; 1118 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 1119 PetscReal afill; 1120 PetscInt i, j, ndouble = 0; 1121 PetscSegBuffer seg, segrow; 1122 char *seen; 1123 1124 PetscFunctionBegin; 1125 PetscCall(PetscMalloc1(am + 1, &ci)); 1126 ci[0] = 0; 1127 1128 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 1129 PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg)); 1130 PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow)); 1131 PetscCall(PetscCalloc1(bn, &seen)); 1132 1133 /* Determine ci and cj */ 1134 for (i = 0; i < am; i++) { 1135 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1136 const PetscInt *acol = PetscSafePointerPlusOffset(aj, ai[i]); /* column indices of nonzero entries in this row */ 1137 PetscInt packlen = 0, *PETSC_RESTRICT crow; 1138 1139 /* Pack segrow */ 1140 for (j = 0; j < anzi; j++) { 1141 PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k; 1142 for (k = bjstart; k < bjend; k++) { 1143 bcol = bj[k]; 1144 if (!seen[bcol]) { /* new entry */ 1145 PetscInt *PETSC_RESTRICT slot; 1146 PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 1147 *slot = bcol; 1148 seen[bcol] = 1; 1149 packlen++; 1150 } 1151 } 1152 } 1153 1154 /* Check i-th diagonal entry */ 1155 if (C->force_diagonals && !seen[i]) { 1156 PetscInt *PETSC_RESTRICT slot; 1157 PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 1158 *slot = i; 1159 seen[i] = 1; 1160 packlen++; 1161 } 1162 1163 PetscCall(PetscSegBufferGetInts(seg, packlen, &crow)); 1164 PetscCall(PetscSegBufferExtractTo(segrow, crow)); 1165 PetscCall(PetscSortInt(packlen, crow)); 1166 ci[i + 1] = ci[i] + packlen; 1167 for (j = 0; j < packlen; j++) seen[crow[j]] = 0; 1168 } 1169 PetscCall(PetscSegBufferDestroy(&segrow)); 1170 PetscCall(PetscFree(seen)); 1171 1172 /* Column indices are in the segmented buffer */ 1173 PetscCall(PetscSegBufferExtractAlloc(seg, &cj)); 1174 PetscCall(PetscSegBufferDestroy(&seg)); 1175 1176 /* put together the new symbolic matrix */ 1177 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 1178 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1179 1180 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1181 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1182 c = (Mat_SeqAIJ *)C->data; 1183 c->free_a = PETSC_TRUE; 1184 c->free_ij = PETSC_TRUE; 1185 c->nonew = 0; 1186 1187 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1188 1189 /* set MatInfo */ 1190 afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5; 1191 if (afill < 1.0) afill = 1.0; 1192 C->info.mallocs = ndouble; 1193 C->info.fill_ratio_given = fill; 1194 C->info.fill_ratio_needed = afill; 1195 1196 #if defined(PETSC_USE_INFO) 1197 if (ci[am]) { 1198 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 1199 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1200 } else { 1201 PetscCall(PetscInfo(C, "Empty matrix product\n")); 1202 } 1203 #endif 1204 PetscFunctionReturn(PETSC_SUCCESS); 1205 } 1206 1207 static PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data) 1208 { 1209 Mat_MatMatTransMult *abt = (Mat_MatMatTransMult *)data; 1210 1211 PetscFunctionBegin; 1212 PetscCall(MatTransposeColoringDestroy(&abt->matcoloring)); 1213 PetscCall(MatDestroy(&abt->Bt_den)); 1214 PetscCall(MatDestroy(&abt->ABt_den)); 1215 PetscCall(PetscFree(abt)); 1216 PetscFunctionReturn(PETSC_SUCCESS); 1217 } 1218 1219 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 1220 { 1221 Mat Bt; 1222 Mat_MatMatTransMult *abt; 1223 Mat_Product *product = C->product; 1224 char *alg; 1225 1226 PetscFunctionBegin; 1227 PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1228 PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 1229 1230 /* create symbolic Bt */ 1231 PetscCall(MatTransposeSymbolic(B, &Bt)); 1232 PetscCall(MatSetBlockSizes(Bt, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs))); 1233 PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name)); 1234 1235 /* get symbolic C=A*Bt */ 1236 PetscCall(PetscStrallocpy(product->alg, &alg)); 1237 PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */ 1238 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C)); 1239 PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */ 1240 PetscCall(PetscFree(alg)); 1241 1242 /* create a supporting struct for reuse intermediate dense matrices with matcoloring */ 1243 PetscCall(PetscNew(&abt)); 1244 1245 product->data = abt; 1246 product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans; 1247 1248 C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ; 1249 1250 abt->usecoloring = PETSC_FALSE; 1251 PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring)); 1252 if (abt->usecoloring) { 1253 /* Create MatTransposeColoring from symbolic C=A*B^T */ 1254 MatTransposeColoring matcoloring; 1255 MatColoring coloring; 1256 ISColoring iscoloring; 1257 Mat Bt_dense, C_dense; 1258 1259 /* inode causes memory problem */ 1260 PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE)); 1261 1262 PetscCall(MatColoringCreate(C, &coloring)); 1263 PetscCall(MatColoringSetDistance(coloring, 2)); 1264 PetscCall(MatColoringSetType(coloring, MATCOLORINGSL)); 1265 PetscCall(MatColoringSetFromOptions(coloring)); 1266 PetscCall(MatColoringApply(coloring, &iscoloring)); 1267 PetscCall(MatColoringDestroy(&coloring)); 1268 PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring)); 1269 1270 abt->matcoloring = matcoloring; 1271 1272 PetscCall(ISColoringDestroy(&iscoloring)); 1273 1274 /* Create Bt_dense and C_dense = A*Bt_dense */ 1275 PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense)); 1276 PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors)); 1277 PetscCall(MatSetType(Bt_dense, MATSEQDENSE)); 1278 PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL)); 1279 1280 Bt_dense->assembled = PETSC_TRUE; 1281 abt->Bt_den = Bt_dense; 1282 1283 PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense)); 1284 PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors)); 1285 PetscCall(MatSetType(C_dense, MATSEQDENSE)); 1286 PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL)); 1287 1288 Bt_dense->assembled = PETSC_TRUE; 1289 abt->ABt_den = C_dense; 1290 1291 #if defined(PETSC_USE_INFO) 1292 { 1293 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 1294 PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n, 1295 Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)c->nz) / ((PetscReal)(A->rmap->n * matcoloring->ncolors))))); 1296 } 1297 #endif 1298 } 1299 /* clean up */ 1300 PetscCall(MatDestroy(&Bt)); 1301 PetscFunctionReturn(PETSC_SUCCESS); 1302 } 1303 1304 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 1305 { 1306 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1307 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow; 1308 PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol; 1309 PetscLogDouble flops = 0.0; 1310 MatScalar *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval; 1311 Mat_MatMatTransMult *abt; 1312 Mat_Product *product = C->product; 1313 1314 PetscFunctionBegin; 1315 PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1316 abt = (Mat_MatMatTransMult *)product->data; 1317 PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1318 /* clear old values in C */ 1319 if (!c->a) { 1320 PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 1321 c->a = ca; 1322 c->free_a = PETSC_TRUE; 1323 } else { 1324 ca = c->a; 1325 PetscCall(PetscArrayzero(ca, ci[cm] + 1)); 1326 } 1327 1328 if (abt->usecoloring) { 1329 MatTransposeColoring matcoloring = abt->matcoloring; 1330 Mat Bt_dense, C_dense = abt->ABt_den; 1331 1332 /* Get Bt_dense by Apply MatTransposeColoring to B */ 1333 Bt_dense = abt->Bt_den; 1334 PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense)); 1335 1336 /* C_dense = A*Bt_dense */ 1337 PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense)); 1338 1339 /* Recover C from C_dense */ 1340 PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C)); 1341 PetscFunctionReturn(PETSC_SUCCESS); 1342 } 1343 1344 for (i = 0; i < cm; i++) { 1345 anzi = ai[i + 1] - ai[i]; 1346 acol = PetscSafePointerPlusOffset(aj, ai[i]); 1347 aval = PetscSafePointerPlusOffset(aa, ai[i]); 1348 cnzi = ci[i + 1] - ci[i]; 1349 ccol = PetscSafePointerPlusOffset(cj, ci[i]); 1350 cval = ca + ci[i]; 1351 for (j = 0; j < cnzi; j++) { 1352 brow = ccol[j]; 1353 bnzj = bi[brow + 1] - bi[brow]; 1354 bcol = bj + bi[brow]; 1355 bval = ba + bi[brow]; 1356 1357 /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */ 1358 nexta = 0; 1359 nextb = 0; 1360 while (nexta < anzi && nextb < bnzj) { 1361 while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++; 1362 if (nexta == anzi) break; 1363 while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++; 1364 if (nextb == bnzj) break; 1365 if (acol[nexta] == bcol[nextb]) { 1366 cval[j] += aval[nexta] * bval[nextb]; 1367 nexta++; 1368 nextb++; 1369 flops += 2; 1370 } 1371 } 1372 } 1373 } 1374 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1375 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 1376 PetscCall(PetscLogFlops(flops)); 1377 PetscFunctionReturn(PETSC_SUCCESS); 1378 } 1379 1380 PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data) 1381 { 1382 Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)data; 1383 1384 PetscFunctionBegin; 1385 PetscCall(MatDestroy(&atb->At)); 1386 if (atb->destroy) PetscCall((*atb->destroy)(atb->data)); 1387 PetscCall(PetscFree(atb)); 1388 PetscFunctionReturn(PETSC_SUCCESS); 1389 } 1390 1391 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 1392 { 1393 Mat At = NULL; 1394 Mat_Product *product = C->product; 1395 PetscBool flg, def, square; 1396 1397 PetscFunctionBegin; 1398 MatCheckProduct(C, 4); 1399 square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE); 1400 /* outerproduct */ 1401 PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg)); 1402 if (flg) { 1403 /* create symbolic At */ 1404 if (!square) { 1405 PetscCall(MatTransposeSymbolic(A, &At)); 1406 PetscCall(MatSetBlockSizes(At, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs))); 1407 PetscCall(MatSetType(At, ((PetscObject)A)->type_name)); 1408 } 1409 /* get symbolic C=At*B */ 1410 PetscCall(MatProductSetAlgorithm(C, "sorted")); 1411 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 1412 1413 /* clean up */ 1414 if (!square) PetscCall(MatDestroy(&At)); 1415 1416 C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */ 1417 PetscCall(MatProductSetAlgorithm(C, "outerproduct")); 1418 PetscFunctionReturn(PETSC_SUCCESS); 1419 } 1420 1421 /* matmatmult */ 1422 PetscCall(PetscStrcmp(product->alg, "default", &def)); 1423 PetscCall(PetscStrcmp(product->alg, "at*b", &flg)); 1424 if (flg || def) { 1425 Mat_MatTransMatMult *atb; 1426 1427 PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 1428 PetscCall(PetscNew(&atb)); 1429 if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At)); 1430 PetscCall(MatProductSetAlgorithm(C, "sorted")); 1431 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 1432 PetscCall(MatProductSetAlgorithm(C, "at*b")); 1433 product->data = atb; 1434 product->destroy = MatDestroy_SeqAIJ_MatTransMatMult; 1435 atb->At = At; 1436 1437 C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */ 1438 PetscFunctionReturn(PETSC_SUCCESS); 1439 } 1440 1441 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 1442 } 1443 1444 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 1445 { 1446 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1447 PetscInt am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb; 1448 PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k; 1449 PetscLogDouble flops = 0.0; 1450 MatScalar *aa = a->a, *ba, *ca, *caj; 1451 1452 PetscFunctionBegin; 1453 if (!c->a) { 1454 PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 1455 1456 c->a = ca; 1457 c->free_a = PETSC_TRUE; 1458 } else { 1459 ca = c->a; 1460 PetscCall(PetscArrayzero(ca, ci[cm])); 1461 } 1462 1463 /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */ 1464 for (i = 0; i < am; i++) { 1465 bj = b->j + bi[i]; 1466 ba = b->a + bi[i]; 1467 bnzi = bi[i + 1] - bi[i]; 1468 anzi = ai[i + 1] - ai[i]; 1469 for (j = 0; j < anzi; j++) { 1470 nextb = 0; 1471 crow = *aj++; 1472 cjj = cj + ci[crow]; 1473 caj = ca + ci[crow]; 1474 /* perform sparse axpy operation. Note cjj includes bj. */ 1475 for (k = 0; nextb < bnzi; k++) { 1476 if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */ 1477 caj[k] += (*aa) * (*(ba + nextb)); 1478 nextb++; 1479 } 1480 } 1481 flops += 2 * bnzi; 1482 aa++; 1483 } 1484 } 1485 1486 /* Assemble the final matrix and clean up */ 1487 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1488 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 1489 PetscCall(PetscLogFlops(flops)); 1490 PetscFunctionReturn(PETSC_SUCCESS); 1491 } 1492 1493 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1494 { 1495 PetscFunctionBegin; 1496 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 1497 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense; 1498 PetscFunctionReturn(PETSC_SUCCESS); 1499 } 1500 1501 PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add) 1502 { 1503 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1504 PetscScalar *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4; 1505 const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av; 1506 const PetscInt *aj; 1507 PetscInt cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n; 1508 PetscInt clda; 1509 PetscInt am4, bm4, col, i, j, n; 1510 1511 PetscFunctionBegin; 1512 if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 1513 PetscCall(MatSeqAIJGetArrayRead(A, &av)); 1514 if (add) { 1515 PetscCall(MatDenseGetArray(C, &c)); 1516 } else { 1517 PetscCall(MatDenseGetArrayWrite(C, &c)); 1518 } 1519 PetscCall(MatDenseGetArrayRead(B, &b)); 1520 PetscCall(MatDenseGetLDA(B, &bm)); 1521 PetscCall(MatDenseGetLDA(C, &clda)); 1522 am4 = 4 * clda; 1523 bm4 = 4 * bm; 1524 if (b) { 1525 b1 = b; 1526 b2 = b1 + bm; 1527 b3 = b2 + bm; 1528 b4 = b3 + bm; 1529 } else b1 = b2 = b3 = b4 = NULL; 1530 c1 = c; 1531 c2 = c1 + clda; 1532 c3 = c2 + clda; 1533 c4 = c3 + clda; 1534 for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */ 1535 for (i = 0; i < am; i++) { /* over rows of A in those columns */ 1536 r1 = r2 = r3 = r4 = 0.0; 1537 n = a->i[i + 1] - a->i[i]; 1538 aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 1539 aa = PetscSafePointerPlusOffset(av, a->i[i]); 1540 for (j = 0; j < n; j++) { 1541 const PetscScalar aatmp = aa[j]; 1542 const PetscInt ajtmp = aj[j]; 1543 r1 += aatmp * b1[ajtmp]; 1544 r2 += aatmp * b2[ajtmp]; 1545 r3 += aatmp * b3[ajtmp]; 1546 r4 += aatmp * b4[ajtmp]; 1547 } 1548 if (add) { 1549 c1[i] += r1; 1550 c2[i] += r2; 1551 c3[i] += r3; 1552 c4[i] += r4; 1553 } else { 1554 c1[i] = r1; 1555 c2[i] = r2; 1556 c3[i] = r3; 1557 c4[i] = r4; 1558 } 1559 } 1560 if (b) { 1561 b1 += bm4; 1562 b2 += bm4; 1563 b3 += bm4; 1564 b4 += bm4; 1565 } 1566 c1 += am4; 1567 c2 += am4; 1568 c3 += am4; 1569 c4 += am4; 1570 } 1571 /* process remaining columns */ 1572 if (col != cn) { 1573 PetscInt rc = cn - col; 1574 1575 if (rc == 1) { 1576 for (i = 0; i < am; i++) { 1577 r1 = 0.0; 1578 n = a->i[i + 1] - a->i[i]; 1579 aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 1580 aa = PetscSafePointerPlusOffset(av, a->i[i]); 1581 for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]]; 1582 if (add) c1[i] += r1; 1583 else c1[i] = r1; 1584 } 1585 } else if (rc == 2) { 1586 for (i = 0; i < am; i++) { 1587 r1 = r2 = 0.0; 1588 n = a->i[i + 1] - a->i[i]; 1589 aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 1590 aa = PetscSafePointerPlusOffset(av, a->i[i]); 1591 for (j = 0; j < n; j++) { 1592 const PetscScalar aatmp = aa[j]; 1593 const PetscInt ajtmp = aj[j]; 1594 r1 += aatmp * b1[ajtmp]; 1595 r2 += aatmp * b2[ajtmp]; 1596 } 1597 if (add) { 1598 c1[i] += r1; 1599 c2[i] += r2; 1600 } else { 1601 c1[i] = r1; 1602 c2[i] = r2; 1603 } 1604 } 1605 } else { 1606 for (i = 0; i < am; i++) { 1607 r1 = r2 = r3 = 0.0; 1608 n = a->i[i + 1] - a->i[i]; 1609 aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 1610 aa = PetscSafePointerPlusOffset(av, a->i[i]); 1611 for (j = 0; j < n; j++) { 1612 const PetscScalar aatmp = aa[j]; 1613 const PetscInt ajtmp = aj[j]; 1614 r1 += aatmp * b1[ajtmp]; 1615 r2 += aatmp * b2[ajtmp]; 1616 r3 += aatmp * b3[ajtmp]; 1617 } 1618 if (add) { 1619 c1[i] += r1; 1620 c2[i] += r2; 1621 c3[i] += r3; 1622 } else { 1623 c1[i] = r1; 1624 c2[i] = r2; 1625 c3[i] = r3; 1626 } 1627 } 1628 } 1629 } 1630 PetscCall(PetscLogFlops(cn * (2.0 * a->nz))); 1631 if (add) { 1632 PetscCall(MatDenseRestoreArray(C, &c)); 1633 } else { 1634 PetscCall(MatDenseRestoreArrayWrite(C, &c)); 1635 } 1636 PetscCall(MatDenseRestoreArrayRead(B, &b)); 1637 PetscCall(MatSeqAIJRestoreArrayRead(A, &av)); 1638 PetscFunctionReturn(PETSC_SUCCESS); 1639 } 1640 1641 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C) 1642 { 1643 PetscFunctionBegin; 1644 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 1645 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 1646 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1647 1648 PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE)); 1649 PetscFunctionReturn(PETSC_SUCCESS); 1650 } 1651 1652 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C) 1653 { 1654 PetscFunctionBegin; 1655 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense; 1656 C->ops->productsymbolic = MatProductSymbolic_AB; 1657 PetscFunctionReturn(PETSC_SUCCESS); 1658 } 1659 1660 PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat); 1661 1662 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C) 1663 { 1664 PetscFunctionBegin; 1665 C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1666 C->ops->productsymbolic = MatProductSymbolic_AtB; 1667 PetscFunctionReturn(PETSC_SUCCESS); 1668 } 1669 1670 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C) 1671 { 1672 PetscFunctionBegin; 1673 C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1674 C->ops->productsymbolic = MatProductSymbolic_ABt; 1675 PetscFunctionReturn(PETSC_SUCCESS); 1676 } 1677 1678 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C) 1679 { 1680 Mat_Product *product = C->product; 1681 1682 PetscFunctionBegin; 1683 switch (product->type) { 1684 case MATPRODUCT_AB: 1685 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C)); 1686 break; 1687 case MATPRODUCT_AtB: 1688 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C)); 1689 break; 1690 case MATPRODUCT_ABt: 1691 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C)); 1692 break; 1693 default: 1694 break; 1695 } 1696 PetscFunctionReturn(PETSC_SUCCESS); 1697 } 1698 1699 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C) 1700 { 1701 Mat_Product *product = C->product; 1702 Mat A = product->A; 1703 PetscBool baij; 1704 1705 PetscFunctionBegin; 1706 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij)); 1707 if (!baij) { /* A is seqsbaij */ 1708 PetscBool sbaij; 1709 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij)); 1710 PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format"); 1711 1712 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense; 1713 } else { /* A is seqbaij */ 1714 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense; 1715 } 1716 1717 C->ops->productsymbolic = MatProductSymbolic_AB; 1718 PetscFunctionReturn(PETSC_SUCCESS); 1719 } 1720 1721 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C) 1722 { 1723 Mat_Product *product = C->product; 1724 1725 PetscFunctionBegin; 1726 MatCheckProduct(C, 1); 1727 PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A"); 1728 if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C)); 1729 PetscFunctionReturn(PETSC_SUCCESS); 1730 } 1731 1732 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C) 1733 { 1734 PetscFunctionBegin; 1735 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ; 1736 C->ops->productsymbolic = MatProductSymbolic_AB; 1737 PetscFunctionReturn(PETSC_SUCCESS); 1738 } 1739 1740 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C) 1741 { 1742 Mat_Product *product = C->product; 1743 1744 PetscFunctionBegin; 1745 if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C)); 1746 PetscFunctionReturn(PETSC_SUCCESS); 1747 } 1748 1749 PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense) 1750 { 1751 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1752 Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data; 1753 PetscInt *bi = b->i, *bj = b->j; 1754 PetscInt m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns; 1755 MatScalar *btval, *btval_den, *ba = b->a; 1756 PetscInt *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors; 1757 1758 PetscFunctionBegin; 1759 btval_den = btdense->v; 1760 PetscCall(PetscArrayzero(btval_den, m * n)); 1761 for (k = 0; k < ncolors; k++) { 1762 ncolumns = coloring->ncolumns[k]; 1763 for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */ 1764 col = *(columns + colorforcol[k] + l); 1765 btcol = bj + bi[col]; 1766 btval = ba + bi[col]; 1767 anz = bi[col + 1] - bi[col]; 1768 for (j = 0; j < anz; j++) { 1769 brow = btcol[j]; 1770 btval_den[brow] = btval[j]; 1771 } 1772 } 1773 btval_den += m; 1774 } 1775 PetscFunctionReturn(PETSC_SUCCESS); 1776 } 1777 1778 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp) 1779 { 1780 Mat_SeqAIJ *csp = (Mat_SeqAIJ *)Csp->data; 1781 const PetscScalar *ca_den, *ca_den_ptr; 1782 PetscScalar *ca = csp->a; 1783 PetscInt k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors; 1784 PetscInt brows = matcoloring->brows, *den2sp = matcoloring->den2sp; 1785 PetscInt nrows, *row, *idx; 1786 PetscInt *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow; 1787 1788 PetscFunctionBegin; 1789 PetscCall(MatDenseGetArrayRead(Cden, &ca_den)); 1790 1791 if (brows > 0) { 1792 PetscInt *lstart, row_end, row_start; 1793 lstart = matcoloring->lstart; 1794 PetscCall(PetscArrayzero(lstart, ncolors)); 1795 1796 row_end = brows; 1797 if (row_end > m) row_end = m; 1798 for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */ 1799 ca_den_ptr = ca_den; 1800 for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */ 1801 nrows = matcoloring->nrows[k]; 1802 row = rows + colorforrow[k]; 1803 idx = den2sp + colorforrow[k]; 1804 for (l = lstart[k]; l < nrows; l++) { 1805 if (row[l] >= row_end) { 1806 lstart[k] = l; 1807 break; 1808 } else { 1809 ca[idx[l]] = ca_den_ptr[row[l]]; 1810 } 1811 } 1812 ca_den_ptr += m; 1813 } 1814 row_end += brows; 1815 if (row_end > m) row_end = m; 1816 } 1817 } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */ 1818 ca_den_ptr = ca_den; 1819 for (k = 0; k < ncolors; k++) { 1820 nrows = matcoloring->nrows[k]; 1821 row = rows + colorforrow[k]; 1822 idx = den2sp + colorforrow[k]; 1823 for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]]; 1824 ca_den_ptr += m; 1825 } 1826 } 1827 1828 PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den)); 1829 #if defined(PETSC_USE_INFO) 1830 if (matcoloring->brows > 0) { 1831 PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows)); 1832 } else { 1833 PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n")); 1834 } 1835 #endif 1836 PetscFunctionReturn(PETSC_SUCCESS); 1837 } 1838 1839 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c) 1840 { 1841 PetscInt i, n, nrows, Nbs, j, k, m, ncols, col, cm; 1842 const PetscInt *is, *ci, *cj, *row_idx; 1843 PetscInt nis = iscoloring->n, *rowhit, bs = 1; 1844 IS *isa; 1845 Mat_SeqAIJ *csp = (Mat_SeqAIJ *)mat->data; 1846 PetscInt *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i; 1847 PetscInt *colorforcol, *columns, *columns_i, brows; 1848 PetscBool flg; 1849 1850 PetscFunctionBegin; 1851 PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa)); 1852 1853 /* bs >1 is not being tested yet! */ 1854 Nbs = mat->cmap->N / bs; 1855 c->M = mat->rmap->N / bs; /* set total rows, columns and local rows */ 1856 c->N = Nbs; 1857 c->m = c->M; 1858 c->rstart = 0; 1859 c->brows = 100; 1860 1861 c->ncolors = nis; 1862 PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow)); 1863 PetscCall(PetscMalloc1(csp->nz + 1, &rows)); 1864 PetscCall(PetscMalloc1(csp->nz + 1, &den2sp)); 1865 1866 brows = c->brows; 1867 PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg)); 1868 if (flg) c->brows = brows; 1869 if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart)); 1870 1871 colorforrow[0] = 0; 1872 rows_i = rows; 1873 den2sp_i = den2sp; 1874 1875 PetscCall(PetscMalloc1(nis + 1, &colorforcol)); 1876 PetscCall(PetscMalloc1(Nbs + 1, &columns)); 1877 1878 colorforcol[0] = 0; 1879 columns_i = columns; 1880 1881 /* get column-wise storage of mat */ 1882 PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 1883 1884 cm = c->m; 1885 PetscCall(PetscMalloc1(cm + 1, &rowhit)); 1886 PetscCall(PetscMalloc1(cm + 1, &idxhit)); 1887 for (i = 0; i < nis; i++) { /* loop over color */ 1888 PetscCall(ISGetLocalSize(isa[i], &n)); 1889 PetscCall(ISGetIndices(isa[i], &is)); 1890 1891 c->ncolumns[i] = n; 1892 if (n) PetscCall(PetscArraycpy(columns_i, is, n)); 1893 colorforcol[i + 1] = colorforcol[i] + n; 1894 columns_i += n; 1895 1896 /* fast, crude version requires O(N*N) work */ 1897 PetscCall(PetscArrayzero(rowhit, cm)); 1898 1899 for (j = 0; j < n; j++) { /* loop over columns*/ 1900 col = is[j]; 1901 row_idx = cj + ci[col]; 1902 m = ci[col + 1] - ci[col]; 1903 for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */ 1904 idxhit[*row_idx] = spidx[ci[col] + k]; 1905 rowhit[*row_idx++] = col + 1; 1906 } 1907 } 1908 /* count the number of hits */ 1909 nrows = 0; 1910 for (j = 0; j < cm; j++) { 1911 if (rowhit[j]) nrows++; 1912 } 1913 c->nrows[i] = nrows; 1914 colorforrow[i + 1] = colorforrow[i] + nrows; 1915 1916 nrows = 0; 1917 for (j = 0; j < cm; j++) { /* loop over rows */ 1918 if (rowhit[j]) { 1919 rows_i[nrows] = j; 1920 den2sp_i[nrows] = idxhit[j]; 1921 nrows++; 1922 } 1923 } 1924 den2sp_i += nrows; 1925 1926 PetscCall(ISRestoreIndices(isa[i], &is)); 1927 rows_i += nrows; 1928 } 1929 PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 1930 PetscCall(PetscFree(rowhit)); 1931 PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa)); 1932 PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]); 1933 1934 c->colorforrow = colorforrow; 1935 c->rows = rows; 1936 c->den2sp = den2sp; 1937 c->colorforcol = colorforcol; 1938 c->columns = columns; 1939 1940 PetscCall(PetscFree(idxhit)); 1941 PetscFunctionReturn(PETSC_SUCCESS); 1942 } 1943 1944 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C) 1945 { 1946 Mat_Product *product = C->product; 1947 Mat A = product->A, B = product->B; 1948 1949 PetscFunctionBegin; 1950 if (C->ops->mattransposemultnumeric) { 1951 /* Alg: "outerproduct" */ 1952 PetscCall((*C->ops->mattransposemultnumeric)(A, B, C)); 1953 } else { 1954 /* Alg: "matmatmult" -- C = At*B */ 1955 Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data; 1956 1957 PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1958 if (atb->At) { 1959 /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(); 1960 user may have called MatProductReplaceMats() to get this A=product->A */ 1961 PetscCall(MatTransposeSetPrecursor(A, atb->At)); 1962 PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At)); 1963 } 1964 PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C)); 1965 } 1966 PetscFunctionReturn(PETSC_SUCCESS); 1967 } 1968 1969 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C) 1970 { 1971 Mat_Product *product = C->product; 1972 Mat A = product->A, B = product->B; 1973 PetscReal fill = product->fill; 1974 1975 PetscFunctionBegin; 1976 PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C)); 1977 1978 C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ; 1979 PetscFunctionReturn(PETSC_SUCCESS); 1980 } 1981 1982 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C) 1983 { 1984 Mat_Product *product = C->product; 1985 PetscInt alg = 0; /* default algorithm */ 1986 PetscBool flg = PETSC_FALSE; 1987 #if !defined(PETSC_HAVE_HYPRE) 1988 const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 1989 PetscInt nalg = 7; 1990 #else 1991 const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"}; 1992 PetscInt nalg = 8; 1993 #endif 1994 1995 PetscFunctionBegin; 1996 /* Set default algorithm */ 1997 PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 1998 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 1999 2000 /* Get runtime option */ 2001 if (product->api_user) { 2002 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat"); 2003 PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg)); 2004 PetscOptionsEnd(); 2005 } else { 2006 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat"); 2007 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg)); 2008 PetscOptionsEnd(); 2009 } 2010 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2011 2012 C->ops->productsymbolic = MatProductSymbolic_AB; 2013 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ; 2014 PetscFunctionReturn(PETSC_SUCCESS); 2015 } 2016 2017 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C) 2018 { 2019 Mat_Product *product = C->product; 2020 PetscInt alg = 0; /* default algorithm */ 2021 PetscBool flg = PETSC_FALSE; 2022 const char *algTypes[3] = {"default", "at*b", "outerproduct"}; 2023 PetscInt nalg = 3; 2024 2025 PetscFunctionBegin; 2026 /* Get runtime option */ 2027 if (product->api_user) { 2028 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat"); 2029 PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2030 PetscOptionsEnd(); 2031 } else { 2032 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat"); 2033 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg)); 2034 PetscOptionsEnd(); 2035 } 2036 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2037 2038 C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ; 2039 PetscFunctionReturn(PETSC_SUCCESS); 2040 } 2041 2042 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C) 2043 { 2044 Mat_Product *product = C->product; 2045 PetscInt alg = 0; /* default algorithm */ 2046 PetscBool flg = PETSC_FALSE; 2047 const char *algTypes[2] = {"default", "color"}; 2048 PetscInt nalg = 2; 2049 2050 PetscFunctionBegin; 2051 /* Set default algorithm */ 2052 PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 2053 if (!flg) { 2054 alg = 1; 2055 PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2056 } 2057 2058 /* Get runtime option */ 2059 if (product->api_user) { 2060 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat"); 2061 PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2062 PetscOptionsEnd(); 2063 } else { 2064 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat"); 2065 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg)); 2066 PetscOptionsEnd(); 2067 } 2068 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2069 2070 C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ; 2071 C->ops->productsymbolic = MatProductSymbolic_ABt; 2072 PetscFunctionReturn(PETSC_SUCCESS); 2073 } 2074 2075 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C) 2076 { 2077 Mat_Product *product = C->product; 2078 PetscBool flg = PETSC_FALSE; 2079 PetscInt alg = 0; /* default algorithm -- alg=1 should be default!!! */ 2080 #if !defined(PETSC_HAVE_HYPRE) 2081 const char *algTypes[2] = {"scalable", "rap"}; 2082 PetscInt nalg = 2; 2083 #else 2084 const char *algTypes[3] = {"scalable", "rap", "hypre"}; 2085 PetscInt nalg = 3; 2086 #endif 2087 2088 PetscFunctionBegin; 2089 /* Set default algorithm */ 2090 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2091 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2092 2093 /* Get runtime option */ 2094 if (product->api_user) { 2095 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat"); 2096 PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2097 PetscOptionsEnd(); 2098 } else { 2099 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat"); 2100 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2101 PetscOptionsEnd(); 2102 } 2103 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2104 2105 C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ; 2106 PetscFunctionReturn(PETSC_SUCCESS); 2107 } 2108 2109 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C) 2110 { 2111 Mat_Product *product = C->product; 2112 PetscBool flg = PETSC_FALSE; 2113 PetscInt alg = 0; /* default algorithm */ 2114 const char *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"}; 2115 PetscInt nalg = 3; 2116 2117 PetscFunctionBegin; 2118 /* Set default algorithm */ 2119 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2120 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2121 2122 /* Get runtime option */ 2123 if (product->api_user) { 2124 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat"); 2125 PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2126 PetscOptionsEnd(); 2127 } else { 2128 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat"); 2129 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2130 PetscOptionsEnd(); 2131 } 2132 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2133 2134 C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ; 2135 PetscFunctionReturn(PETSC_SUCCESS); 2136 } 2137 2138 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */ 2139 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C) 2140 { 2141 Mat_Product *product = C->product; 2142 PetscInt alg = 0; /* default algorithm */ 2143 PetscBool flg = PETSC_FALSE; 2144 const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 2145 PetscInt nalg = 7; 2146 2147 PetscFunctionBegin; 2148 /* Set default algorithm */ 2149 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2150 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2151 2152 /* Get runtime option */ 2153 if (product->api_user) { 2154 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat"); 2155 PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2156 PetscOptionsEnd(); 2157 } else { 2158 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat"); 2159 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg)); 2160 PetscOptionsEnd(); 2161 } 2162 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2163 2164 C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ; 2165 C->ops->productsymbolic = MatProductSymbolic_ABC; 2166 PetscFunctionReturn(PETSC_SUCCESS); 2167 } 2168 2169 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C) 2170 { 2171 Mat_Product *product = C->product; 2172 2173 PetscFunctionBegin; 2174 switch (product->type) { 2175 case MATPRODUCT_AB: 2176 PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C)); 2177 break; 2178 case MATPRODUCT_AtB: 2179 PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C)); 2180 break; 2181 case MATPRODUCT_ABt: 2182 PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C)); 2183 break; 2184 case MATPRODUCT_PtAP: 2185 PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C)); 2186 break; 2187 case MATPRODUCT_RARt: 2188 PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C)); 2189 break; 2190 case MATPRODUCT_ABC: 2191 PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C)); 2192 break; 2193 default: 2194 break; 2195 } 2196 PetscFunctionReturn(PETSC_SUCCESS); 2197 } 2198