xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 9a6bacb05cc5a54be4cd68af72f4fbfbc48b3cb3)
1 /*
2   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
3           C = A * B
4 */
5 
6 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7 #include <../src/mat/utils/freespace.h>
8 #include <petscbt.h>
9 #include <petsc/private/isimpl.h>
10 #include <../src/mat/impls/dense/seq/dense.h>
11 
12 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
13 {
14   PetscFunctionBegin;
15   if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C));
16   else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C));
17   PetscFunctionReturn(PETSC_SUCCESS);
18 }
19 
20 /* Modified from MatCreateSeqAIJWithArrays() */
21 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat)
22 {
23   PetscInt    ii;
24   Mat_SeqAIJ *aij;
25   PetscBool   isseqaij, ofree_a, ofree_ij;
26 
27   PetscFunctionBegin;
28   PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
29   PetscCall(MatSetSizes(mat, m, n, m, n));
30 
31   if (!mtype) {
32     PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij));
33     if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ));
34   } else {
35     PetscCall(MatSetType(mat, mtype));
36   }
37 
38   aij      = (Mat_SeqAIJ *)(mat)->data;
39   ofree_a  = aij->free_a;
40   ofree_ij = aij->free_ij;
41   /* changes the free flags */
42   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL));
43 
44   PetscCall(PetscFree(aij->ilen));
45   PetscCall(PetscFree(aij->imax));
46   PetscCall(PetscMalloc1(m, &aij->imax));
47   PetscCall(PetscMalloc1(m, &aij->ilen));
48   for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) {
49     const PetscInt rnz = i[ii + 1] - i[ii];
50     aij->nonzerorowcnt += !!rnz;
51     aij->rmax     = PetscMax(aij->rmax, rnz);
52     aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii];
53   }
54   aij->maxnz = i[m];
55   aij->nz    = i[m];
56 
57   if (ofree_a) PetscCall(PetscShmgetDeallocateArray((void **)&aij->a));
58   if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->j));
59   if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->i));
60 
61   aij->i       = i;
62   aij->j       = j;
63   aij->a       = a;
64   aij->nonew   = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
65   aij->free_a  = PETSC_FALSE;
66   aij->free_ij = PETSC_FALSE;
67   PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6));
68   // Always build the diag info when i, j are set
69   PetscCall(MatMarkDiagonal_SeqAIJ(mat));
70   PetscFunctionReturn(PETSC_SUCCESS);
71 }
72 
73 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
74 {
75   Mat_Product        *product = C->product;
76   MatProductAlgorithm alg;
77   PetscBool           flg;
78 
79   PetscFunctionBegin;
80   if (product) {
81     alg = product->alg;
82   } else {
83     alg = "sorted";
84   }
85   /* sorted */
86   PetscCall(PetscStrcmp(alg, "sorted", &flg));
87   if (flg) {
88     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C));
89     PetscFunctionReturn(PETSC_SUCCESS);
90   }
91 
92   /* scalable */
93   PetscCall(PetscStrcmp(alg, "scalable", &flg));
94   if (flg) {
95     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C));
96     PetscFunctionReturn(PETSC_SUCCESS);
97   }
98 
99   /* scalable_fast */
100   PetscCall(PetscStrcmp(alg, "scalable_fast", &flg));
101   if (flg) {
102     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C));
103     PetscFunctionReturn(PETSC_SUCCESS);
104   }
105 
106   /* heap */
107   PetscCall(PetscStrcmp(alg, "heap", &flg));
108   if (flg) {
109     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C));
110     PetscFunctionReturn(PETSC_SUCCESS);
111   }
112 
113   /* btheap */
114   PetscCall(PetscStrcmp(alg, "btheap", &flg));
115   if (flg) {
116     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C));
117     PetscFunctionReturn(PETSC_SUCCESS);
118   }
119 
120   /* llcondensed */
121   PetscCall(PetscStrcmp(alg, "llcondensed", &flg));
122   if (flg) {
123     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C));
124     PetscFunctionReturn(PETSC_SUCCESS);
125   }
126 
127   /* rowmerge */
128   PetscCall(PetscStrcmp(alg, "rowmerge", &flg));
129   if (flg) {
130     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C));
131     PetscFunctionReturn(PETSC_SUCCESS);
132   }
133 
134 #if defined(PETSC_HAVE_HYPRE)
135   PetscCall(PetscStrcmp(alg, "hypre", &flg));
136   if (flg) {
137     PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C));
138     PetscFunctionReturn(PETSC_SUCCESS);
139   }
140 #endif
141 
142   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
143 }
144 
145 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C)
146 {
147   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
148   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
149   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
150   PetscReal          afill;
151   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
152   PetscHMapI         ta;
153   PetscBT            lnkbt;
154   PetscFreeSpaceList free_space = NULL, current_space = NULL;
155 
156   PetscFunctionBegin;
157   /* Get ci and cj */
158   /* Allocate ci array, arrays for fill computation and */
159   /* free space for accumulating nonzero column info */
160   PetscCall(PetscMalloc1(am + 2, &ci));
161   ci[0] = 0;
162 
163   /* create and initialize a linked list */
164   PetscCall(PetscHMapICreateWithSize(bn, &ta));
165   MatRowMergeMax_SeqAIJ(b, bm, ta);
166   PetscCall(PetscHMapIGetSize(ta, &Crmax));
167   PetscCall(PetscHMapIDestroy(&ta));
168 
169   PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt));
170 
171   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
172   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
173 
174   current_space = free_space;
175 
176   /* Determine ci and cj */
177   for (i = 0; i < am; i++) {
178     anzi = ai[i + 1] - ai[i];
179     aj   = a->j + ai[i];
180     for (j = 0; j < anzi; j++) {
181       brow = aj[j];
182       bnzj = bi[brow + 1] - bi[brow];
183       bj   = b->j + bi[brow];
184       /* add non-zero cols of B into the sorted linked list lnk */
185       PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt));
186     }
187     /* add possible missing diagonal entry */
188     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt));
189     cnzi = lnk[0];
190 
191     /* If free space is not available, make more free space */
192     /* Double the amount of total space in the list */
193     if (current_space->local_remaining < cnzi) {
194       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
195       ndouble++;
196     }
197 
198     /* Copy data into free space, then initialize lnk */
199     PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt));
200 
201     current_space->array += cnzi;
202     current_space->local_used += cnzi;
203     current_space->local_remaining -= cnzi;
204 
205     ci[i + 1] = ci[i] + cnzi;
206   }
207 
208   /* Column indices are in the list of free space */
209   /* Allocate space for cj, initialize cj, and */
210   /* destroy list of free space and other temporary array(s) */
211   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
212   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
213   PetscCall(PetscLLCondensedDestroy(lnk, lnkbt));
214 
215   /* put together the new symbolic matrix */
216   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
217   PetscCall(MatSetBlockSizesFromMats(C, A, B));
218 
219   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
220   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
221   c          = (Mat_SeqAIJ *)C->data;
222   c->free_a  = PETSC_FALSE;
223   c->free_ij = PETSC_TRUE;
224   c->nonew   = 0;
225 
226   /* fast, needs non-scalable O(bn) array 'abdense' */
227   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
228 
229   /* set MatInfo */
230   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
231   if (afill < 1.0) afill = 1.0;
232   C->info.mallocs           = ndouble;
233   C->info.fill_ratio_given  = fill;
234   C->info.fill_ratio_needed = afill;
235 
236 #if defined(PETSC_USE_INFO)
237   if (ci[am]) {
238     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
239     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
240   } else {
241     PetscCall(PetscInfo(C, "Empty matrix product\n"));
242   }
243 #endif
244   PetscFunctionReturn(PETSC_SUCCESS);
245 }
246 
247 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C)
248 {
249   PetscLogDouble     flops = 0.0;
250   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
251   Mat_SeqAIJ        *b     = (Mat_SeqAIJ *)B->data;
252   Mat_SeqAIJ        *c     = (Mat_SeqAIJ *)C->data;
253   PetscInt          *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
254   PetscInt           am = A->rmap->n, cm = C->rmap->n;
255   PetscInt           i, j, k, anzi, bnzi, cnzi, brow;
256   PetscScalar       *ca, valtmp;
257   PetscScalar       *ab_dense;
258   PetscContainer     cab_dense;
259   const PetscScalar *aa, *ba, *baj;
260 
261   PetscFunctionBegin;
262   PetscCall(MatSeqAIJGetArrayRead(A, &aa));
263   PetscCall(MatSeqAIJGetArrayRead(B, &ba));
264   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
265     PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
266     c->a      = ca;
267     c->free_a = PETSC_TRUE;
268   } else ca = c->a;
269 
270   /* TODO this should be done in the symbolic phase */
271   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
272      that is hard to eradicate) */
273   PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense));
274   if (!cab_dense) {
275     PetscCall(PetscMalloc1(B->cmap->N, &ab_dense));
276     PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &cab_dense));
277     PetscCall(PetscContainerSetPointer(cab_dense, ab_dense));
278     PetscCall(PetscContainerSetUserDestroy(cab_dense, PetscContainerUserDestroyDefault));
279     PetscCall(PetscObjectCompose((PetscObject)C, "__PETSc__ab_dense", (PetscObject)cab_dense));
280     PetscCall(PetscObjectDereference((PetscObject)cab_dense));
281   }
282   PetscCall(PetscContainerGetPointer(cab_dense, (void **)&ab_dense));
283   PetscCall(PetscArrayzero(ab_dense, B->cmap->N));
284 
285   /* clean old values in C */
286   PetscCall(PetscArrayzero(ca, ci[cm]));
287   /* Traverse A row-wise. */
288   /* Build the ith row in C by summing over nonzero columns in A, */
289   /* the rows of B corresponding to nonzeros of A. */
290   for (i = 0; i < am; i++) {
291     anzi = ai[i + 1] - ai[i];
292     for (j = 0; j < anzi; j++) {
293       brow = aj[j];
294       bnzi = bi[brow + 1] - bi[brow];
295       bjj  = PetscSafePointerPlusOffset(bj, bi[brow]);
296       baj  = PetscSafePointerPlusOffset(ba, bi[brow]);
297       /* perform dense axpy */
298       valtmp = aa[j];
299       for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k];
300       flops += 2 * bnzi;
301     }
302     aj = PetscSafePointerPlusOffset(aj, anzi);
303     aa = PetscSafePointerPlusOffset(aa, anzi);
304 
305     cnzi = ci[i + 1] - ci[i];
306     for (k = 0; k < cnzi; k++) {
307       ca[k] += ab_dense[cj[k]];
308       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
309     }
310     flops += cnzi;
311     cj = PetscSafePointerPlusOffset(cj, cnzi);
312     ca += cnzi;
313   }
314 #if defined(PETSC_HAVE_DEVICE)
315   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
316 #endif
317   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
318   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
319   PetscCall(PetscLogFlops(flops));
320   PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
321   PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
322   PetscFunctionReturn(PETSC_SUCCESS);
323 }
324 
325 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C)
326 {
327   PetscLogDouble     flops = 0.0;
328   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
329   Mat_SeqAIJ        *b     = (Mat_SeqAIJ *)B->data;
330   Mat_SeqAIJ        *c     = (Mat_SeqAIJ *)C->data;
331   PetscInt          *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
332   PetscInt           am = A->rmap->N, cm = C->rmap->N;
333   PetscInt           i, j, k, anzi, bnzi, cnzi, brow;
334   PetscScalar       *ca = c->a, valtmp;
335   const PetscScalar *aa, *ba, *baj;
336   PetscInt           nextb;
337 
338   PetscFunctionBegin;
339   PetscCall(MatSeqAIJGetArrayRead(A, &aa));
340   PetscCall(MatSeqAIJGetArrayRead(B, &ba));
341   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
342     PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
343     c->a      = ca;
344     c->free_a = PETSC_TRUE;
345   }
346 
347   /* clean old values in C */
348   PetscCall(PetscArrayzero(ca, ci[cm]));
349   /* Traverse A row-wise. */
350   /* Build the ith row in C by summing over nonzero columns in A, */
351   /* the rows of B corresponding to nonzeros of A. */
352   for (i = 0; i < am; i++) {
353     anzi = ai[i + 1] - ai[i];
354     cnzi = ci[i + 1] - ci[i];
355     for (j = 0; j < anzi; j++) {
356       brow = aj[j];
357       bnzi = bi[brow + 1] - bi[brow];
358       bjj  = bj + bi[brow];
359       baj  = ba + bi[brow];
360       /* perform sparse axpy */
361       valtmp = aa[j];
362       nextb  = 0;
363       for (k = 0; nextb < bnzi; k++) {
364         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
365           ca[k] += valtmp * baj[nextb++];
366         }
367       }
368       flops += 2 * bnzi;
369     }
370     aj += anzi;
371     aa += anzi;
372     cj += cnzi;
373     ca += cnzi;
374   }
375 #if defined(PETSC_HAVE_DEVICE)
376   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
377 #endif
378   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
379   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
380   PetscCall(PetscLogFlops(flops));
381   PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
382   PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
383   PetscFunctionReturn(PETSC_SUCCESS);
384 }
385 
386 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C)
387 {
388   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
389   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
390   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
391   MatScalar         *ca;
392   PetscReal          afill;
393   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
394   PetscHMapI         ta;
395   PetscFreeSpaceList free_space = NULL, current_space = NULL;
396 
397   PetscFunctionBegin;
398   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
399   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
400   PetscCall(PetscMalloc1(am + 2, &ci));
401   ci[0] = 0;
402 
403   /* create and initialize a linked list */
404   PetscCall(PetscHMapICreateWithSize(bn, &ta));
405   MatRowMergeMax_SeqAIJ(b, bm, ta);
406   PetscCall(PetscHMapIGetSize(ta, &Crmax));
407   PetscCall(PetscHMapIDestroy(&ta));
408 
409   PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk));
410 
411   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
412   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
413   current_space = free_space;
414 
415   /* Determine ci and cj */
416   for (i = 0; i < am; i++) {
417     anzi = ai[i + 1] - ai[i];
418     aj   = a->j + ai[i];
419     for (j = 0; j < anzi; j++) {
420       brow = aj[j];
421       bnzj = bi[brow + 1] - bi[brow];
422       bj   = b->j + bi[brow];
423       /* add non-zero cols of B into the sorted linked list lnk */
424       PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk));
425     }
426     /* add possible missing diagonal entry */
427     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk));
428     cnzi = lnk[1];
429 
430     /* If free space is not available, make more free space */
431     /* Double the amount of total space in the list */
432     if (current_space->local_remaining < cnzi) {
433       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
434       ndouble++;
435     }
436 
437     /* Copy data into free space, then initialize lnk */
438     PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk));
439 
440     current_space->array += cnzi;
441     current_space->local_used += cnzi;
442     current_space->local_remaining -= cnzi;
443 
444     ci[i + 1] = ci[i] + cnzi;
445   }
446 
447   /* Column indices are in the list of free space */
448   /* Allocate space for cj, initialize cj, and */
449   /* destroy list of free space and other temporary array(s) */
450   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
451   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
452   PetscCall(PetscLLCondensedDestroy_fast(lnk));
453 
454   /* Allocate space for ca */
455   PetscCall(PetscCalloc1(ci[am] + 1, &ca));
456 
457   /* put together the new symbolic matrix */
458   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
459   PetscCall(MatSetBlockSizesFromMats(C, A, B));
460 
461   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
462   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
463   c          = (Mat_SeqAIJ *)C->data;
464   c->free_a  = PETSC_TRUE;
465   c->free_ij = PETSC_TRUE;
466   c->nonew   = 0;
467 
468   /* slower, less memory */
469   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
470 
471   /* set MatInfo */
472   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
473   if (afill < 1.0) afill = 1.0;
474   C->info.mallocs           = ndouble;
475   C->info.fill_ratio_given  = fill;
476   C->info.fill_ratio_needed = afill;
477 
478 #if defined(PETSC_USE_INFO)
479   if (ci[am]) {
480     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
481     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
482   } else {
483     PetscCall(PetscInfo(C, "Empty matrix product\n"));
484   }
485 #endif
486   PetscFunctionReturn(PETSC_SUCCESS);
487 }
488 
489 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C)
490 {
491   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
492   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
493   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
494   MatScalar         *ca;
495   PetscReal          afill;
496   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
497   PetscHMapI         ta;
498   PetscFreeSpaceList free_space = NULL, current_space = NULL;
499 
500   PetscFunctionBegin;
501   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
502   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
503   PetscCall(PetscMalloc1(am + 2, &ci));
504   ci[0] = 0;
505 
506   /* create and initialize a linked list */
507   PetscCall(PetscHMapICreateWithSize(bn, &ta));
508   MatRowMergeMax_SeqAIJ(b, bm, ta);
509   PetscCall(PetscHMapIGetSize(ta, &Crmax));
510   PetscCall(PetscHMapIDestroy(&ta));
511   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
512 
513   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
514   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
515   current_space = free_space;
516 
517   /* Determine ci and cj */
518   for (i = 0; i < am; i++) {
519     anzi = ai[i + 1] - ai[i];
520     aj   = a->j + ai[i];
521     for (j = 0; j < anzi; j++) {
522       brow = aj[j];
523       bnzj = bi[brow + 1] - bi[brow];
524       bj   = b->j + bi[brow];
525       /* add non-zero cols of B into the sorted linked list lnk */
526       PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk));
527     }
528     /* add possible missing diagonal entry */
529     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk));
530 
531     cnzi = lnk[0];
532 
533     /* If free space is not available, make more free space */
534     /* Double the amount of total space in the list */
535     if (current_space->local_remaining < cnzi) {
536       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
537       ndouble++;
538     }
539 
540     /* Copy data into free space, then initialize lnk */
541     PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk));
542 
543     current_space->array += cnzi;
544     current_space->local_used += cnzi;
545     current_space->local_remaining -= cnzi;
546 
547     ci[i + 1] = ci[i] + cnzi;
548   }
549 
550   /* Column indices are in the list of free space */
551   /* Allocate space for cj, initialize cj, and */
552   /* destroy list of free space and other temporary array(s) */
553   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
554   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
555   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
556 
557   /* Allocate space for ca */
558   PetscCall(PetscCalloc1(ci[am] + 1, &ca));
559 
560   /* put together the new symbolic matrix */
561   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
562   PetscCall(MatSetBlockSizesFromMats(C, A, B));
563 
564   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
565   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
566   c          = (Mat_SeqAIJ *)C->data;
567   c->free_a  = PETSC_TRUE;
568   c->free_ij = PETSC_TRUE;
569   c->nonew   = 0;
570 
571   /* slower, less memory */
572   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
573 
574   /* set MatInfo */
575   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
576   if (afill < 1.0) afill = 1.0;
577   C->info.mallocs           = ndouble;
578   C->info.fill_ratio_given  = fill;
579   C->info.fill_ratio_needed = afill;
580 
581 #if defined(PETSC_USE_INFO)
582   if (ci[am]) {
583     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
584     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
585   } else {
586     PetscCall(PetscInfo(C, "Empty matrix product\n"));
587   }
588 #endif
589   PetscFunctionReturn(PETSC_SUCCESS);
590 }
591 
592 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C)
593 {
594   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
595   const PetscInt    *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
596   PetscInt          *ci, *cj, *bb;
597   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
598   PetscReal          afill;
599   PetscInt           i, j, col, ndouble = 0;
600   PetscFreeSpaceList free_space = NULL, current_space = NULL;
601   PetscHeap          h;
602 
603   PetscFunctionBegin;
604   /* Get ci and cj - by merging sorted rows using a heap */
605   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
606   PetscCall(PetscMalloc1(am + 2, &ci));
607   ci[0] = 0;
608 
609   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
610   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
611   current_space = free_space;
612 
613   PetscCall(PetscHeapCreate(a->rmax, &h));
614   PetscCall(PetscMalloc1(a->rmax, &bb));
615 
616   /* Determine ci and cj */
617   for (i = 0; i < am; i++) {
618     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
619     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
620     ci[i + 1]            = ci[i];
621     /* Populate the min heap */
622     for (j = 0; j < anzi; j++) {
623       bb[j] = bi[acol[j]];           /* bb points at the start of the row */
624       if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */
625         PetscCall(PetscHeapAdd(h, j, bj[bb[j]++]));
626       }
627     }
628     /* Pick off the min element, adding it to free space */
629     PetscCall(PetscHeapPop(h, &j, &col));
630     while (j >= 0) {
631       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
632         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), &current_space));
633         ndouble++;
634       }
635       *(current_space->array++) = col;
636       current_space->local_used++;
637       current_space->local_remaining--;
638       ci[i + 1]++;
639 
640       /* stash if anything else remains in this row of B */
641       if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++]));
642       while (1) { /* pop and stash any other rows of B that also had an entry in this column */
643         PetscInt j2, col2;
644         PetscCall(PetscHeapPeek(h, &j2, &col2));
645         if (col2 != col) break;
646         PetscCall(PetscHeapPop(h, &j2, &col2));
647         if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++]));
648       }
649       /* Put any stashed elements back into the min heap */
650       PetscCall(PetscHeapUnstash(h));
651       PetscCall(PetscHeapPop(h, &j, &col));
652     }
653   }
654   PetscCall(PetscFree(bb));
655   PetscCall(PetscHeapDestroy(&h));
656 
657   /* Column indices are in the list of free space */
658   /* Allocate space for cj, initialize cj, and */
659   /* destroy list of free space and other temporary array(s) */
660   PetscCall(PetscMalloc1(ci[am], &cj));
661   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
662 
663   /* put together the new symbolic matrix */
664   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
665   PetscCall(MatSetBlockSizesFromMats(C, A, B));
666 
667   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
668   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
669   c          = (Mat_SeqAIJ *)C->data;
670   c->free_a  = PETSC_TRUE;
671   c->free_ij = PETSC_TRUE;
672   c->nonew   = 0;
673 
674   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
675 
676   /* set MatInfo */
677   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
678   if (afill < 1.0) afill = 1.0;
679   C->info.mallocs           = ndouble;
680   C->info.fill_ratio_given  = fill;
681   C->info.fill_ratio_needed = afill;
682 
683 #if defined(PETSC_USE_INFO)
684   if (ci[am]) {
685     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
686     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
687   } else {
688     PetscCall(PetscInfo(C, "Empty matrix product\n"));
689   }
690 #endif
691   PetscFunctionReturn(PETSC_SUCCESS);
692 }
693 
694 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C)
695 {
696   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
697   const PetscInt    *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
698   PetscInt          *ci, *cj, *bb;
699   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
700   PetscReal          afill;
701   PetscInt           i, j, col, ndouble = 0;
702   PetscFreeSpaceList free_space = NULL, current_space = NULL;
703   PetscHeap          h;
704   PetscBT            bt;
705 
706   PetscFunctionBegin;
707   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
708   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
709   PetscCall(PetscMalloc1(am + 2, &ci));
710   ci[0] = 0;
711 
712   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
713   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
714 
715   current_space = free_space;
716 
717   PetscCall(PetscHeapCreate(a->rmax, &h));
718   PetscCall(PetscMalloc1(a->rmax, &bb));
719   PetscCall(PetscBTCreate(bn, &bt));
720 
721   /* Determine ci and cj */
722   for (i = 0; i < am; i++) {
723     const PetscInt  anzi = ai[i + 1] - ai[i];    /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
724     const PetscInt *acol = aj + ai[i];           /* column indices of nonzero entries in this row */
725     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
726     ci[i + 1]            = ci[i];
727     /* Populate the min heap */
728     for (j = 0; j < anzi; j++) {
729       PetscInt brow = acol[j];
730       for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) {
731         PetscInt bcol = bj[bb[j]];
732         if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
733           PetscCall(PetscHeapAdd(h, j, bcol));
734           bb[j]++;
735           break;
736         }
737       }
738     }
739     /* Pick off the min element, adding it to free space */
740     PetscCall(PetscHeapPop(h, &j, &col));
741     while (j >= 0) {
742       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
743         fptr = NULL;                            /* need PetscBTMemzero */
744         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), &current_space));
745         ndouble++;
746       }
747       *(current_space->array++) = col;
748       current_space->local_used++;
749       current_space->local_remaining--;
750       ci[i + 1]++;
751 
752       /* stash if anything else remains in this row of B */
753       for (; bb[j] < bi[acol[j] + 1]; bb[j]++) {
754         PetscInt bcol = bj[bb[j]];
755         if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
756           PetscCall(PetscHeapAdd(h, j, bcol));
757           bb[j]++;
758           break;
759         }
760       }
761       PetscCall(PetscHeapPop(h, &j, &col));
762     }
763     if (fptr) { /* Clear the bits for this row */
764       for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr));
765     } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
766       PetscCall(PetscBTMemzero(bn, bt));
767     }
768   }
769   PetscCall(PetscFree(bb));
770   PetscCall(PetscHeapDestroy(&h));
771   PetscCall(PetscBTDestroy(&bt));
772 
773   /* Column indices are in the list of free space */
774   /* Allocate space for cj, initialize cj, and */
775   /* destroy list of free space and other temporary array(s) */
776   PetscCall(PetscMalloc1(ci[am], &cj));
777   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
778 
779   /* put together the new symbolic matrix */
780   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
781   PetscCall(MatSetBlockSizesFromMats(C, A, B));
782 
783   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
784   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
785   c          = (Mat_SeqAIJ *)C->data;
786   c->free_a  = PETSC_TRUE;
787   c->free_ij = PETSC_TRUE;
788   c->nonew   = 0;
789 
790   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
791 
792   /* set MatInfo */
793   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
794   if (afill < 1.0) afill = 1.0;
795   C->info.mallocs           = ndouble;
796   C->info.fill_ratio_given  = fill;
797   C->info.fill_ratio_needed = afill;
798 
799 #if defined(PETSC_USE_INFO)
800   if (ci[am]) {
801     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
802     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
803   } else {
804     PetscCall(PetscInfo(C, "Empty matrix product\n"));
805   }
806 #endif
807   PetscFunctionReturn(PETSC_SUCCESS);
808 }
809 
810 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C)
811 {
812   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
813   const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1;
814   PetscInt       *ci, *cj, *outputj, worki_L1[9], worki_L2[9];
815   PetscInt        c_maxmem, a_maxrownnz = 0, a_rownnz;
816   const PetscInt  workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7};
817   const PetscInt  am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
818   const PetscInt *brow_ptr[8], *brow_end[8];
819   PetscInt        window[8];
820   PetscInt        window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows;
821   PetscInt        i, k, ndouble = 0, L1_rowsleft, rowsleft;
822   PetscReal       afill;
823   PetscInt       *workj_L1, *workj_L2, *workj_L3;
824   PetscInt        L1_nnz, L2_nnz;
825 
826   /* Step 1: Get upper bound on memory required for allocation.
827              Because of the way virtual memory works,
828              only the memory pages that are actually needed will be physically allocated. */
829   PetscFunctionBegin;
830   PetscCall(PetscMalloc1(am + 1, &ci));
831   for (i = 0; i < am; i++) {
832     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
833     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
834     a_rownnz             = 0;
835     for (k = 0; k < anzi; ++k) {
836       a_rownnz += bi[acol[k] + 1] - bi[acol[k]];
837       if (a_rownnz > bn) {
838         a_rownnz = bn;
839         break;
840       }
841     }
842     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
843   }
844   /* temporary work areas for merging rows */
845   PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1));
846   PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2));
847   PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3));
848 
849   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
850   c_maxmem = 8 * (ai[am] + bi[bm]);
851   /* Step 2: Populate pattern for C */
852   PetscCall(PetscMalloc1(c_maxmem, &cj));
853 
854   ci_nnz      = 0;
855   ci[0]       = 0;
856   worki_L1[0] = 0;
857   worki_L2[0] = 0;
858   for (i = 0; i < am; i++) {
859     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
860     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
861     rowsleft             = anzi;
862     inputcol_L1          = acol;
863     L2_nnz               = 0;
864     L2_nrows             = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
865     worki_L2[1]          = 0;
866     outputi_nnz          = 0;
867 
868     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
869     while (ci_nnz + a_maxrownnz > c_maxmem) {
870       c_maxmem *= 2;
871       ndouble++;
872       PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj));
873     }
874 
875     while (rowsleft) {
876       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
877       L1_nrows    = 0;
878       L1_nnz      = 0;
879       inputcol    = inputcol_L1;
880       inputi      = bi;
881       inputj      = bj;
882 
883       /* The following macro is used to specialize for small rows in A.
884          This helps with compiler unrolling, improving performance substantially.
885           Input:  inputj   inputi  inputcol  bn
886           Output: outputj  outputi_nnz                       */
887 #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \
888   do { \
889     window_min  = bn; \
890     outputi_nnz = 0; \
891     for (k = 0; k < ANNZ; ++k) { \
892       brow_ptr[k] = inputj + inputi[inputcol[k]]; \
893       brow_end[k] = inputj + inputi[inputcol[k] + 1]; \
894       window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
895       window_min  = PetscMin(window[k], window_min); \
896     } \
897     while (window_min < bn) { \
898       outputj[outputi_nnz++] = window_min; \
899       /* advance front and compute new minimum */ \
900       old_window_min = window_min; \
901       window_min     = bn; \
902       for (k = 0; k < ANNZ; ++k) { \
903         if (window[k] == old_window_min) { \
904           brow_ptr[k]++; \
905           window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
906         } \
907         window_min = PetscMin(window[k], window_min); \
908       } \
909     } \
910   } while (0)
911 
912       /************** L E V E L  1 ***************/
913       /* Merge up to 8 rows of B to L1 work array*/
914       while (L1_rowsleft) {
915         outputi_nnz = 0;
916         if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/
917         else outputj = cj + ci_nnz;                /* Merge directly to C */
918 
919         switch (L1_rowsleft) {
920         case 1:
921           brow_ptr[0] = inputj + inputi[inputcol[0]];
922           brow_end[0] = inputj + inputi[inputcol[0] + 1];
923           for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
924           inputcol += L1_rowsleft;
925           rowsleft -= L1_rowsleft;
926           L1_rowsleft = 0;
927           break;
928         case 2:
929           MatMatMultSymbolic_RowMergeMacro(2);
930           inputcol += L1_rowsleft;
931           rowsleft -= L1_rowsleft;
932           L1_rowsleft = 0;
933           break;
934         case 3:
935           MatMatMultSymbolic_RowMergeMacro(3);
936           inputcol += L1_rowsleft;
937           rowsleft -= L1_rowsleft;
938           L1_rowsleft = 0;
939           break;
940         case 4:
941           MatMatMultSymbolic_RowMergeMacro(4);
942           inputcol += L1_rowsleft;
943           rowsleft -= L1_rowsleft;
944           L1_rowsleft = 0;
945           break;
946         case 5:
947           MatMatMultSymbolic_RowMergeMacro(5);
948           inputcol += L1_rowsleft;
949           rowsleft -= L1_rowsleft;
950           L1_rowsleft = 0;
951           break;
952         case 6:
953           MatMatMultSymbolic_RowMergeMacro(6);
954           inputcol += L1_rowsleft;
955           rowsleft -= L1_rowsleft;
956           L1_rowsleft = 0;
957           break;
958         case 7:
959           MatMatMultSymbolic_RowMergeMacro(7);
960           inputcol += L1_rowsleft;
961           rowsleft -= L1_rowsleft;
962           L1_rowsleft = 0;
963           break;
964         default:
965           MatMatMultSymbolic_RowMergeMacro(8);
966           inputcol += 8;
967           rowsleft -= 8;
968           L1_rowsleft -= 8;
969           break;
970         }
971         inputcol_L1 = inputcol;
972         L1_nnz += outputi_nnz;
973         worki_L1[++L1_nrows] = L1_nnz;
974       }
975 
976       /********************** L E V E L  2 ************************/
977       /* Merge from L1 work array to either C or to L2 work array */
978       if (anzi > 8) {
979         inputi      = worki_L1;
980         inputj      = workj_L1;
981         inputcol    = workcol;
982         outputi_nnz = 0;
983 
984         if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */
985         else outputj = workj_L2 + L2_nnz;      /* Merge from L1 work array to L2 work array */
986 
987         switch (L1_nrows) {
988         case 1:
989           brow_ptr[0] = inputj + inputi[inputcol[0]];
990           brow_end[0] = inputj + inputi[inputcol[0] + 1];
991           for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
992           break;
993         case 2:
994           MatMatMultSymbolic_RowMergeMacro(2);
995           break;
996         case 3:
997           MatMatMultSymbolic_RowMergeMacro(3);
998           break;
999         case 4:
1000           MatMatMultSymbolic_RowMergeMacro(4);
1001           break;
1002         case 5:
1003           MatMatMultSymbolic_RowMergeMacro(5);
1004           break;
1005         case 6:
1006           MatMatMultSymbolic_RowMergeMacro(6);
1007           break;
1008         case 7:
1009           MatMatMultSymbolic_RowMergeMacro(7);
1010           break;
1011         case 8:
1012           MatMatMultSymbolic_RowMergeMacro(8);
1013           break;
1014         default:
1015           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
1016         }
1017         L2_nnz += outputi_nnz;
1018         worki_L2[++L2_nrows] = L2_nnz;
1019 
1020         /************************ L E V E L  3 **********************/
1021         /* Merge from L2 work array to either C or to L2 work array */
1022         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1023           inputi      = worki_L2;
1024           inputj      = workj_L2;
1025           inputcol    = workcol;
1026           outputi_nnz = 0;
1027           if (rowsleft) outputj = workj_L3;
1028           else outputj = cj + ci_nnz;
1029           switch (L2_nrows) {
1030           case 1:
1031             brow_ptr[0] = inputj + inputi[inputcol[0]];
1032             brow_end[0] = inputj + inputi[inputcol[0] + 1];
1033             for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1034             break;
1035           case 2:
1036             MatMatMultSymbolic_RowMergeMacro(2);
1037             break;
1038           case 3:
1039             MatMatMultSymbolic_RowMergeMacro(3);
1040             break;
1041           case 4:
1042             MatMatMultSymbolic_RowMergeMacro(4);
1043             break;
1044           case 5:
1045             MatMatMultSymbolic_RowMergeMacro(5);
1046             break;
1047           case 6:
1048             MatMatMultSymbolic_RowMergeMacro(6);
1049             break;
1050           case 7:
1051             MatMatMultSymbolic_RowMergeMacro(7);
1052             break;
1053           case 8:
1054             MatMatMultSymbolic_RowMergeMacro(8);
1055             break;
1056           default:
1057             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1058           }
1059           L2_nrows    = 1;
1060           L2_nnz      = outputi_nnz;
1061           worki_L2[1] = outputi_nnz;
1062           /* Copy to workj_L2 */
1063           if (rowsleft) {
1064             for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k];
1065           }
1066         }
1067       }
1068     } /* while (rowsleft) */
1069 #undef MatMatMultSymbolic_RowMergeMacro
1070 
1071     /* terminate current row */
1072     ci_nnz += outputi_nnz;
1073     ci[i + 1] = ci_nnz;
1074   }
1075 
1076   /* Step 3: Create the new symbolic matrix */
1077   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
1078   PetscCall(MatSetBlockSizesFromMats(C, A, B));
1079 
1080   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1081   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1082   c          = (Mat_SeqAIJ *)C->data;
1083   c->free_a  = PETSC_TRUE;
1084   c->free_ij = PETSC_TRUE;
1085   c->nonew   = 0;
1086 
1087   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1088 
1089   /* set MatInfo */
1090   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
1091   if (afill < 1.0) afill = 1.0;
1092   C->info.mallocs           = ndouble;
1093   C->info.fill_ratio_given  = fill;
1094   C->info.fill_ratio_needed = afill;
1095 
1096 #if defined(PETSC_USE_INFO)
1097   if (ci[am]) {
1098     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
1099     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1100   } else {
1101     PetscCall(PetscInfo(C, "Empty matrix product\n"));
1102   }
1103 #endif
1104 
1105   /* Step 4: Free temporary work areas */
1106   PetscCall(PetscFree(workj_L1));
1107   PetscCall(PetscFree(workj_L2));
1108   PetscCall(PetscFree(workj_L3));
1109   PetscFunctionReturn(PETSC_SUCCESS);
1110 }
1111 
1112 /* concatenate unique entries and then sort */
1113 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C)
1114 {
1115   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
1116   const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
1117   PetscInt       *ci, *cj, bcol;
1118   PetscInt        am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
1119   PetscReal       afill;
1120   PetscInt        i, j, ndouble = 0;
1121   PetscSegBuffer  seg, segrow;
1122   char           *seen;
1123 
1124   PetscFunctionBegin;
1125   PetscCall(PetscMalloc1(am + 1, &ci));
1126   ci[0] = 0;
1127 
1128   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1129   PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg));
1130   PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow));
1131   PetscCall(PetscCalloc1(bn, &seen));
1132 
1133   /* Determine ci and cj */
1134   for (i = 0; i < am; i++) {
1135     const PetscInt  anzi = ai[i + 1] - ai[i];                     /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1136     const PetscInt *acol = PetscSafePointerPlusOffset(aj, ai[i]); /* column indices of nonzero entries in this row */
1137     PetscInt packlen     = 0, *PETSC_RESTRICT crow;
1138 
1139     /* Pack segrow */
1140     for (j = 0; j < anzi; j++) {
1141       PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k;
1142       for (k = bjstart; k < bjend; k++) {
1143         bcol = bj[k];
1144         if (!seen[bcol]) { /* new entry */
1145           PetscInt *PETSC_RESTRICT slot;
1146           PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
1147           *slot      = bcol;
1148           seen[bcol] = 1;
1149           packlen++;
1150         }
1151       }
1152     }
1153 
1154     /* Check i-th diagonal entry */
1155     if (C->force_diagonals && !seen[i]) {
1156       PetscInt *PETSC_RESTRICT slot;
1157       PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
1158       *slot   = i;
1159       seen[i] = 1;
1160       packlen++;
1161     }
1162 
1163     PetscCall(PetscSegBufferGetInts(seg, packlen, &crow));
1164     PetscCall(PetscSegBufferExtractTo(segrow, crow));
1165     PetscCall(PetscSortInt(packlen, crow));
1166     ci[i + 1] = ci[i] + packlen;
1167     for (j = 0; j < packlen; j++) seen[crow[j]] = 0;
1168   }
1169   PetscCall(PetscSegBufferDestroy(&segrow));
1170   PetscCall(PetscFree(seen));
1171 
1172   /* Column indices are in the segmented buffer */
1173   PetscCall(PetscSegBufferExtractAlloc(seg, &cj));
1174   PetscCall(PetscSegBufferDestroy(&seg));
1175 
1176   /* put together the new symbolic matrix */
1177   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
1178   PetscCall(MatSetBlockSizesFromMats(C, A, B));
1179 
1180   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1181   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1182   c          = (Mat_SeqAIJ *)C->data;
1183   c->free_a  = PETSC_TRUE;
1184   c->free_ij = PETSC_TRUE;
1185   c->nonew   = 0;
1186 
1187   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1188 
1189   /* set MatInfo */
1190   afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5;
1191   if (afill < 1.0) afill = 1.0;
1192   C->info.mallocs           = ndouble;
1193   C->info.fill_ratio_given  = fill;
1194   C->info.fill_ratio_needed = afill;
1195 
1196 #if defined(PETSC_USE_INFO)
1197   if (ci[am]) {
1198     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
1199     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1200   } else {
1201     PetscCall(PetscInfo(C, "Empty matrix product\n"));
1202   }
1203 #endif
1204   PetscFunctionReturn(PETSC_SUCCESS);
1205 }
1206 
1207 static PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
1208 {
1209   Mat_MatMatTransMult *abt = (Mat_MatMatTransMult *)data;
1210 
1211   PetscFunctionBegin;
1212   PetscCall(MatTransposeColoringDestroy(&abt->matcoloring));
1213   PetscCall(MatDestroy(&abt->Bt_den));
1214   PetscCall(MatDestroy(&abt->ABt_den));
1215   PetscCall(PetscFree(abt));
1216   PetscFunctionReturn(PETSC_SUCCESS);
1217 }
1218 
1219 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1220 {
1221   Mat                  Bt;
1222   Mat_MatMatTransMult *abt;
1223   Mat_Product         *product = C->product;
1224   char                *alg;
1225 
1226   PetscFunctionBegin;
1227   PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1228   PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
1229 
1230   /* create symbolic Bt */
1231   PetscCall(MatTransposeSymbolic(B, &Bt));
1232   PetscCall(MatSetBlockSizes(Bt, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs)));
1233   PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name));
1234 
1235   /* get symbolic C=A*Bt */
1236   PetscCall(PetscStrallocpy(product->alg, &alg));
1237   PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */
1238   PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C));
1239   PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */
1240   PetscCall(PetscFree(alg));
1241 
1242   /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
1243   PetscCall(PetscNew(&abt));
1244 
1245   product->data    = abt;
1246   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
1247 
1248   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
1249 
1250   abt->usecoloring = PETSC_FALSE;
1251   PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring));
1252   if (abt->usecoloring) {
1253     /* Create MatTransposeColoring from symbolic C=A*B^T */
1254     MatTransposeColoring matcoloring;
1255     MatColoring          coloring;
1256     ISColoring           iscoloring;
1257     Mat                  Bt_dense, C_dense;
1258 
1259     /* inode causes memory problem */
1260     PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE));
1261 
1262     PetscCall(MatColoringCreate(C, &coloring));
1263     PetscCall(MatColoringSetDistance(coloring, 2));
1264     PetscCall(MatColoringSetType(coloring, MATCOLORINGSL));
1265     PetscCall(MatColoringSetFromOptions(coloring));
1266     PetscCall(MatColoringApply(coloring, &iscoloring));
1267     PetscCall(MatColoringDestroy(&coloring));
1268     PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring));
1269 
1270     abt->matcoloring = matcoloring;
1271 
1272     PetscCall(ISColoringDestroy(&iscoloring));
1273 
1274     /* Create Bt_dense and C_dense = A*Bt_dense */
1275     PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense));
1276     PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors));
1277     PetscCall(MatSetType(Bt_dense, MATSEQDENSE));
1278     PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL));
1279 
1280     Bt_dense->assembled = PETSC_TRUE;
1281     abt->Bt_den         = Bt_dense;
1282 
1283     PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense));
1284     PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors));
1285     PetscCall(MatSetType(C_dense, MATSEQDENSE));
1286     PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL));
1287 
1288     Bt_dense->assembled = PETSC_TRUE;
1289     abt->ABt_den        = C_dense;
1290 
1291 #if defined(PETSC_USE_INFO)
1292     {
1293       Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
1294       PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n,
1295                           Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)c->nz) / ((PetscReal)(A->rmap->n * matcoloring->ncolors)))));
1296     }
1297 #endif
1298   }
1299   /* clean up */
1300   PetscCall(MatDestroy(&Bt));
1301   PetscFunctionReturn(PETSC_SUCCESS);
1302 }
1303 
1304 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1305 {
1306   Mat_SeqAIJ          *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1307   PetscInt            *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow;
1308   PetscInt             cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol;
1309   PetscLogDouble       flops = 0.0;
1310   MatScalar           *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval;
1311   Mat_MatMatTransMult *abt;
1312   Mat_Product         *product = C->product;
1313 
1314   PetscFunctionBegin;
1315   PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1316   abt = (Mat_MatMatTransMult *)product->data;
1317   PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1318   /* clear old values in C */
1319   if (!c->a) {
1320     PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
1321     c->a      = ca;
1322     c->free_a = PETSC_TRUE;
1323   } else {
1324     ca = c->a;
1325     PetscCall(PetscArrayzero(ca, ci[cm] + 1));
1326   }
1327 
1328   if (abt->usecoloring) {
1329     MatTransposeColoring matcoloring = abt->matcoloring;
1330     Mat                  Bt_dense, C_dense = abt->ABt_den;
1331 
1332     /* Get Bt_dense by Apply MatTransposeColoring to B */
1333     Bt_dense = abt->Bt_den;
1334     PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense));
1335 
1336     /* C_dense = A*Bt_dense */
1337     PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense));
1338 
1339     /* Recover C from C_dense */
1340     PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C));
1341     PetscFunctionReturn(PETSC_SUCCESS);
1342   }
1343 
1344   for (i = 0; i < cm; i++) {
1345     anzi = ai[i + 1] - ai[i];
1346     acol = PetscSafePointerPlusOffset(aj, ai[i]);
1347     aval = PetscSafePointerPlusOffset(aa, ai[i]);
1348     cnzi = ci[i + 1] - ci[i];
1349     ccol = PetscSafePointerPlusOffset(cj, ci[i]);
1350     cval = ca + ci[i];
1351     for (j = 0; j < cnzi; j++) {
1352       brow = ccol[j];
1353       bnzj = bi[brow + 1] - bi[brow];
1354       bcol = bj + bi[brow];
1355       bval = ba + bi[brow];
1356 
1357       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
1358       nexta = 0;
1359       nextb = 0;
1360       while (nexta < anzi && nextb < bnzj) {
1361         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
1362         if (nexta == anzi) break;
1363         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
1364         if (nextb == bnzj) break;
1365         if (acol[nexta] == bcol[nextb]) {
1366           cval[j] += aval[nexta] * bval[nextb];
1367           nexta++;
1368           nextb++;
1369           flops += 2;
1370         }
1371       }
1372     }
1373   }
1374   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1375   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
1376   PetscCall(PetscLogFlops(flops));
1377   PetscFunctionReturn(PETSC_SUCCESS);
1378 }
1379 
1380 PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
1381 {
1382   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)data;
1383 
1384   PetscFunctionBegin;
1385   PetscCall(MatDestroy(&atb->At));
1386   if (atb->destroy) PetscCall((*atb->destroy)(atb->data));
1387   PetscCall(PetscFree(atb));
1388   PetscFunctionReturn(PETSC_SUCCESS);
1389 }
1390 
1391 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1392 {
1393   Mat          At      = NULL;
1394   Mat_Product *product = C->product;
1395   PetscBool    flg, def, square;
1396 
1397   PetscFunctionBegin;
1398   MatCheckProduct(C, 4);
1399   square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE);
1400   /* outerproduct */
1401   PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg));
1402   if (flg) {
1403     /* create symbolic At */
1404     if (!square) {
1405       PetscCall(MatTransposeSymbolic(A, &At));
1406       PetscCall(MatSetBlockSizes(At, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs)));
1407       PetscCall(MatSetType(At, ((PetscObject)A)->type_name));
1408     }
1409     /* get symbolic C=At*B */
1410     PetscCall(MatProductSetAlgorithm(C, "sorted"));
1411     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
1412 
1413     /* clean up */
1414     if (!square) PetscCall(MatDestroy(&At));
1415 
1416     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
1417     PetscCall(MatProductSetAlgorithm(C, "outerproduct"));
1418     PetscFunctionReturn(PETSC_SUCCESS);
1419   }
1420 
1421   /* matmatmult */
1422   PetscCall(PetscStrcmp(product->alg, "default", &def));
1423   PetscCall(PetscStrcmp(product->alg, "at*b", &flg));
1424   if (flg || def) {
1425     Mat_MatTransMatMult *atb;
1426 
1427     PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
1428     PetscCall(PetscNew(&atb));
1429     if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
1430     PetscCall(MatProductSetAlgorithm(C, "sorted"));
1431     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
1432     PetscCall(MatProductSetAlgorithm(C, "at*b"));
1433     product->data    = atb;
1434     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
1435     atb->At          = At;
1436 
1437     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
1438     PetscFunctionReturn(PETSC_SUCCESS);
1439   }
1440 
1441   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
1442 }
1443 
1444 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1445 {
1446   Mat_SeqAIJ    *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1447   PetscInt       am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb;
1448   PetscInt       cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k;
1449   PetscLogDouble flops = 0.0;
1450   MatScalar     *aa    = a->a, *ba, *ca, *caj;
1451 
1452   PetscFunctionBegin;
1453   if (!c->a) {
1454     PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
1455 
1456     c->a      = ca;
1457     c->free_a = PETSC_TRUE;
1458   } else {
1459     ca = c->a;
1460     PetscCall(PetscArrayzero(ca, ci[cm]));
1461   }
1462 
1463   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1464   for (i = 0; i < am; i++) {
1465     bj   = b->j + bi[i];
1466     ba   = b->a + bi[i];
1467     bnzi = bi[i + 1] - bi[i];
1468     anzi = ai[i + 1] - ai[i];
1469     for (j = 0; j < anzi; j++) {
1470       nextb = 0;
1471       crow  = *aj++;
1472       cjj   = cj + ci[crow];
1473       caj   = ca + ci[crow];
1474       /* perform sparse axpy operation.  Note cjj includes bj. */
1475       for (k = 0; nextb < bnzi; k++) {
1476         if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */
1477           caj[k] += (*aa) * (*(ba + nextb));
1478           nextb++;
1479         }
1480       }
1481       flops += 2 * bnzi;
1482       aa++;
1483     }
1484   }
1485 
1486   /* Assemble the final matrix and clean up */
1487   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1488   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
1489   PetscCall(PetscLogFlops(flops));
1490   PetscFunctionReturn(PETSC_SUCCESS);
1491 }
1492 
1493 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1494 {
1495   PetscFunctionBegin;
1496   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1497   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1498   PetscFunctionReturn(PETSC_SUCCESS);
1499 }
1500 
1501 PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add)
1502 {
1503   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
1504   PetscScalar       *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4;
1505   const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av;
1506   const PetscInt    *aj;
1507   PetscInt           cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n;
1508   PetscInt           clda;
1509   PetscInt           am4, bm4, col, i, j, n;
1510 
1511   PetscFunctionBegin;
1512   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
1513   PetscCall(MatSeqAIJGetArrayRead(A, &av));
1514   if (add) {
1515     PetscCall(MatDenseGetArray(C, &c));
1516   } else {
1517     PetscCall(MatDenseGetArrayWrite(C, &c));
1518   }
1519   PetscCall(MatDenseGetArrayRead(B, &b));
1520   PetscCall(MatDenseGetLDA(B, &bm));
1521   PetscCall(MatDenseGetLDA(C, &clda));
1522   am4 = 4 * clda;
1523   bm4 = 4 * bm;
1524   if (b) {
1525     b1 = b;
1526     b2 = b1 + bm;
1527     b3 = b2 + bm;
1528     b4 = b3 + bm;
1529   } else b1 = b2 = b3 = b4 = NULL;
1530   c1 = c;
1531   c2 = c1 + clda;
1532   c3 = c2 + clda;
1533   c4 = c3 + clda;
1534   for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */
1535     for (i = 0; i < am; i++) {                  /* over rows of A in those columns */
1536       r1 = r2 = r3 = r4 = 0.0;
1537       n                 = a->i[i + 1] - a->i[i];
1538       aj                = PetscSafePointerPlusOffset(a->j, a->i[i]);
1539       aa                = PetscSafePointerPlusOffset(av, a->i[i]);
1540       for (j = 0; j < n; j++) {
1541         const PetscScalar aatmp = aa[j];
1542         const PetscInt    ajtmp = aj[j];
1543         r1 += aatmp * b1[ajtmp];
1544         r2 += aatmp * b2[ajtmp];
1545         r3 += aatmp * b3[ajtmp];
1546         r4 += aatmp * b4[ajtmp];
1547       }
1548       if (add) {
1549         c1[i] += r1;
1550         c2[i] += r2;
1551         c3[i] += r3;
1552         c4[i] += r4;
1553       } else {
1554         c1[i] = r1;
1555         c2[i] = r2;
1556         c3[i] = r3;
1557         c4[i] = r4;
1558       }
1559     }
1560     if (b) {
1561       b1 += bm4;
1562       b2 += bm4;
1563       b3 += bm4;
1564       b4 += bm4;
1565     }
1566     c1 += am4;
1567     c2 += am4;
1568     c3 += am4;
1569     c4 += am4;
1570   }
1571   /* process remaining columns */
1572   if (col != cn) {
1573     PetscInt rc = cn - col;
1574 
1575     if (rc == 1) {
1576       for (i = 0; i < am; i++) {
1577         r1 = 0.0;
1578         n  = a->i[i + 1] - a->i[i];
1579         aj = PetscSafePointerPlusOffset(a->j, a->i[i]);
1580         aa = PetscSafePointerPlusOffset(av, a->i[i]);
1581         for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]];
1582         if (add) c1[i] += r1;
1583         else c1[i] = r1;
1584       }
1585     } else if (rc == 2) {
1586       for (i = 0; i < am; i++) {
1587         r1 = r2 = 0.0;
1588         n       = a->i[i + 1] - a->i[i];
1589         aj      = PetscSafePointerPlusOffset(a->j, a->i[i]);
1590         aa      = PetscSafePointerPlusOffset(av, a->i[i]);
1591         for (j = 0; j < n; j++) {
1592           const PetscScalar aatmp = aa[j];
1593           const PetscInt    ajtmp = aj[j];
1594           r1 += aatmp * b1[ajtmp];
1595           r2 += aatmp * b2[ajtmp];
1596         }
1597         if (add) {
1598           c1[i] += r1;
1599           c2[i] += r2;
1600         } else {
1601           c1[i] = r1;
1602           c2[i] = r2;
1603         }
1604       }
1605     } else {
1606       for (i = 0; i < am; i++) {
1607         r1 = r2 = r3 = 0.0;
1608         n            = a->i[i + 1] - a->i[i];
1609         aj           = PetscSafePointerPlusOffset(a->j, a->i[i]);
1610         aa           = PetscSafePointerPlusOffset(av, a->i[i]);
1611         for (j = 0; j < n; j++) {
1612           const PetscScalar aatmp = aa[j];
1613           const PetscInt    ajtmp = aj[j];
1614           r1 += aatmp * b1[ajtmp];
1615           r2 += aatmp * b2[ajtmp];
1616           r3 += aatmp * b3[ajtmp];
1617         }
1618         if (add) {
1619           c1[i] += r1;
1620           c2[i] += r2;
1621           c3[i] += r3;
1622         } else {
1623           c1[i] = r1;
1624           c2[i] = r2;
1625           c3[i] = r3;
1626         }
1627       }
1628     }
1629   }
1630   PetscCall(PetscLogFlops(cn * (2.0 * a->nz)));
1631   if (add) {
1632     PetscCall(MatDenseRestoreArray(C, &c));
1633   } else {
1634     PetscCall(MatDenseRestoreArrayWrite(C, &c));
1635   }
1636   PetscCall(MatDenseRestoreArrayRead(B, &b));
1637   PetscCall(MatSeqAIJRestoreArrayRead(A, &av));
1638   PetscFunctionReturn(PETSC_SUCCESS);
1639 }
1640 
1641 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C)
1642 {
1643   PetscFunctionBegin;
1644   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1645   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1646   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1647 
1648   PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE));
1649   PetscFunctionReturn(PETSC_SUCCESS);
1650 }
1651 
1652 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1653 {
1654   PetscFunctionBegin;
1655   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
1656   C->ops->productsymbolic = MatProductSymbolic_AB;
1657   PetscFunctionReturn(PETSC_SUCCESS);
1658 }
1659 
1660 PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat);
1661 
1662 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1663 {
1664   PetscFunctionBegin;
1665   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1666   C->ops->productsymbolic          = MatProductSymbolic_AtB;
1667   PetscFunctionReturn(PETSC_SUCCESS);
1668 }
1669 
1670 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
1671 {
1672   PetscFunctionBegin;
1673   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1674   C->ops->productsymbolic          = MatProductSymbolic_ABt;
1675   PetscFunctionReturn(PETSC_SUCCESS);
1676 }
1677 
1678 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1679 {
1680   Mat_Product *product = C->product;
1681 
1682   PetscFunctionBegin;
1683   switch (product->type) {
1684   case MATPRODUCT_AB:
1685     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C));
1686     break;
1687   case MATPRODUCT_AtB:
1688     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C));
1689     break;
1690   case MATPRODUCT_ABt:
1691     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C));
1692     break;
1693   default:
1694     break;
1695   }
1696   PetscFunctionReturn(PETSC_SUCCESS);
1697 }
1698 
1699 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1700 {
1701   Mat_Product *product = C->product;
1702   Mat          A       = product->A;
1703   PetscBool    baij;
1704 
1705   PetscFunctionBegin;
1706   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij));
1707   if (!baij) { /* A is seqsbaij */
1708     PetscBool sbaij;
1709     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij));
1710     PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format");
1711 
1712     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
1713   } else { /* A is seqbaij */
1714     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
1715   }
1716 
1717   C->ops->productsymbolic = MatProductSymbolic_AB;
1718   PetscFunctionReturn(PETSC_SUCCESS);
1719 }
1720 
1721 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1722 {
1723   Mat_Product *product = C->product;
1724 
1725   PetscFunctionBegin;
1726   MatCheckProduct(C, 1);
1727   PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A");
1728   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C));
1729   PetscFunctionReturn(PETSC_SUCCESS);
1730 }
1731 
1732 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1733 {
1734   PetscFunctionBegin;
1735   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
1736   C->ops->productsymbolic = MatProductSymbolic_AB;
1737   PetscFunctionReturn(PETSC_SUCCESS);
1738 }
1739 
1740 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1741 {
1742   Mat_Product *product = C->product;
1743 
1744   PetscFunctionBegin;
1745   if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C));
1746   PetscFunctionReturn(PETSC_SUCCESS);
1747 }
1748 
1749 PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense)
1750 {
1751   Mat_SeqAIJ   *b       = (Mat_SeqAIJ *)B->data;
1752   Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data;
1753   PetscInt     *bi = b->i, *bj = b->j;
1754   PetscInt      m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns;
1755   MatScalar    *btval, *btval_den, *ba = b->a;
1756   PetscInt     *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors;
1757 
1758   PetscFunctionBegin;
1759   btval_den = btdense->v;
1760   PetscCall(PetscArrayzero(btval_den, m * n));
1761   for (k = 0; k < ncolors; k++) {
1762     ncolumns = coloring->ncolumns[k];
1763     for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */
1764       col   = *(columns + colorforcol[k] + l);
1765       btcol = bj + bi[col];
1766       btval = ba + bi[col];
1767       anz   = bi[col + 1] - bi[col];
1768       for (j = 0; j < anz; j++) {
1769         brow            = btcol[j];
1770         btval_den[brow] = btval[j];
1771       }
1772     }
1773     btval_den += m;
1774   }
1775   PetscFunctionReturn(PETSC_SUCCESS);
1776 }
1777 
1778 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp)
1779 {
1780   Mat_SeqAIJ        *csp = (Mat_SeqAIJ *)Csp->data;
1781   const PetscScalar *ca_den, *ca_den_ptr;
1782   PetscScalar       *ca = csp->a;
1783   PetscInt           k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors;
1784   PetscInt           brows = matcoloring->brows, *den2sp = matcoloring->den2sp;
1785   PetscInt           nrows, *row, *idx;
1786   PetscInt          *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow;
1787 
1788   PetscFunctionBegin;
1789   PetscCall(MatDenseGetArrayRead(Cden, &ca_den));
1790 
1791   if (brows > 0) {
1792     PetscInt *lstart, row_end, row_start;
1793     lstart = matcoloring->lstart;
1794     PetscCall(PetscArrayzero(lstart, ncolors));
1795 
1796     row_end = brows;
1797     if (row_end > m) row_end = m;
1798     for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */
1799       ca_den_ptr = ca_den;
1800       for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */
1801         nrows = matcoloring->nrows[k];
1802         row   = rows + colorforrow[k];
1803         idx   = den2sp + colorforrow[k];
1804         for (l = lstart[k]; l < nrows; l++) {
1805           if (row[l] >= row_end) {
1806             lstart[k] = l;
1807             break;
1808           } else {
1809             ca[idx[l]] = ca_den_ptr[row[l]];
1810           }
1811         }
1812         ca_den_ptr += m;
1813       }
1814       row_end += brows;
1815       if (row_end > m) row_end = m;
1816     }
1817   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1818     ca_den_ptr = ca_den;
1819     for (k = 0; k < ncolors; k++) {
1820       nrows = matcoloring->nrows[k];
1821       row   = rows + colorforrow[k];
1822       idx   = den2sp + colorforrow[k];
1823       for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]];
1824       ca_den_ptr += m;
1825     }
1826   }
1827 
1828   PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den));
1829 #if defined(PETSC_USE_INFO)
1830   if (matcoloring->brows > 0) {
1831     PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows));
1832   } else {
1833     PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n"));
1834   }
1835 #endif
1836   PetscFunctionReturn(PETSC_SUCCESS);
1837 }
1838 
1839 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c)
1840 {
1841   PetscInt        i, n, nrows, Nbs, j, k, m, ncols, col, cm;
1842   const PetscInt *is, *ci, *cj, *row_idx;
1843   PetscInt        nis = iscoloring->n, *rowhit, bs = 1;
1844   IS             *isa;
1845   Mat_SeqAIJ     *csp = (Mat_SeqAIJ *)mat->data;
1846   PetscInt       *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i;
1847   PetscInt       *colorforcol, *columns, *columns_i, brows;
1848   PetscBool       flg;
1849 
1850   PetscFunctionBegin;
1851   PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa));
1852 
1853   /* bs >1 is not being tested yet! */
1854   Nbs       = mat->cmap->N / bs;
1855   c->M      = mat->rmap->N / bs; /* set total rows, columns and local rows */
1856   c->N      = Nbs;
1857   c->m      = c->M;
1858   c->rstart = 0;
1859   c->brows  = 100;
1860 
1861   c->ncolors = nis;
1862   PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow));
1863   PetscCall(PetscMalloc1(csp->nz + 1, &rows));
1864   PetscCall(PetscMalloc1(csp->nz + 1, &den2sp));
1865 
1866   brows = c->brows;
1867   PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg));
1868   if (flg) c->brows = brows;
1869   if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart));
1870 
1871   colorforrow[0] = 0;
1872   rows_i         = rows;
1873   den2sp_i       = den2sp;
1874 
1875   PetscCall(PetscMalloc1(nis + 1, &colorforcol));
1876   PetscCall(PetscMalloc1(Nbs + 1, &columns));
1877 
1878   colorforcol[0] = 0;
1879   columns_i      = columns;
1880 
1881   /* get column-wise storage of mat */
1882   PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
1883 
1884   cm = c->m;
1885   PetscCall(PetscMalloc1(cm + 1, &rowhit));
1886   PetscCall(PetscMalloc1(cm + 1, &idxhit));
1887   for (i = 0; i < nis; i++) { /* loop over color */
1888     PetscCall(ISGetLocalSize(isa[i], &n));
1889     PetscCall(ISGetIndices(isa[i], &is));
1890 
1891     c->ncolumns[i] = n;
1892     if (n) PetscCall(PetscArraycpy(columns_i, is, n));
1893     colorforcol[i + 1] = colorforcol[i] + n;
1894     columns_i += n;
1895 
1896     /* fast, crude version requires O(N*N) work */
1897     PetscCall(PetscArrayzero(rowhit, cm));
1898 
1899     for (j = 0; j < n; j++) { /* loop over columns*/
1900       col     = is[j];
1901       row_idx = cj + ci[col];
1902       m       = ci[col + 1] - ci[col];
1903       for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */
1904         idxhit[*row_idx]   = spidx[ci[col] + k];
1905         rowhit[*row_idx++] = col + 1;
1906       }
1907     }
1908     /* count the number of hits */
1909     nrows = 0;
1910     for (j = 0; j < cm; j++) {
1911       if (rowhit[j]) nrows++;
1912     }
1913     c->nrows[i]        = nrows;
1914     colorforrow[i + 1] = colorforrow[i] + nrows;
1915 
1916     nrows = 0;
1917     for (j = 0; j < cm; j++) { /* loop over rows */
1918       if (rowhit[j]) {
1919         rows_i[nrows]   = j;
1920         den2sp_i[nrows] = idxhit[j];
1921         nrows++;
1922       }
1923     }
1924     den2sp_i += nrows;
1925 
1926     PetscCall(ISRestoreIndices(isa[i], &is));
1927     rows_i += nrows;
1928   }
1929   PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
1930   PetscCall(PetscFree(rowhit));
1931   PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa));
1932   PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]);
1933 
1934   c->colorforrow = colorforrow;
1935   c->rows        = rows;
1936   c->den2sp      = den2sp;
1937   c->colorforcol = colorforcol;
1938   c->columns     = columns;
1939 
1940   PetscCall(PetscFree(idxhit));
1941   PetscFunctionReturn(PETSC_SUCCESS);
1942 }
1943 
1944 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1945 {
1946   Mat_Product *product = C->product;
1947   Mat          A = product->A, B = product->B;
1948 
1949   PetscFunctionBegin;
1950   if (C->ops->mattransposemultnumeric) {
1951     /* Alg: "outerproduct" */
1952     PetscCall((*C->ops->mattransposemultnumeric)(A, B, C));
1953   } else {
1954     /* Alg: "matmatmult" -- C = At*B */
1955     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;
1956 
1957     PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1958     if (atb->At) {
1959       /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ();
1960          user may have called MatProductReplaceMats() to get this A=product->A */
1961       PetscCall(MatTransposeSetPrecursor(A, atb->At));
1962       PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At));
1963     }
1964     PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C));
1965   }
1966   PetscFunctionReturn(PETSC_SUCCESS);
1967 }
1968 
1969 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1970 {
1971   Mat_Product *product = C->product;
1972   Mat          A = product->A, B = product->B;
1973   PetscReal    fill = product->fill;
1974 
1975   PetscFunctionBegin;
1976   PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C));
1977 
1978   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
1979   PetscFunctionReturn(PETSC_SUCCESS);
1980 }
1981 
1982 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1983 {
1984   Mat_Product *product = C->product;
1985   PetscInt     alg     = 0; /* default algorithm */
1986   PetscBool    flg     = PETSC_FALSE;
1987 #if !defined(PETSC_HAVE_HYPRE)
1988   const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
1989   PetscInt    nalg        = 7;
1990 #else
1991   const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"};
1992   PetscInt    nalg        = 8;
1993 #endif
1994 
1995   PetscFunctionBegin;
1996   /* Set default algorithm */
1997   PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
1998   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
1999 
2000   /* Get runtime option */
2001   if (product->api_user) {
2002     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
2003     PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg));
2004     PetscOptionsEnd();
2005   } else {
2006     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
2007     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg));
2008     PetscOptionsEnd();
2009   }
2010   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2011 
2012   C->ops->productsymbolic = MatProductSymbolic_AB;
2013   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
2014   PetscFunctionReturn(PETSC_SUCCESS);
2015 }
2016 
2017 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
2018 {
2019   Mat_Product *product     = C->product;
2020   PetscInt     alg         = 0; /* default algorithm */
2021   PetscBool    flg         = PETSC_FALSE;
2022   const char  *algTypes[3] = {"default", "at*b", "outerproduct"};
2023   PetscInt     nalg        = 3;
2024 
2025   PetscFunctionBegin;
2026   /* Get runtime option */
2027   if (product->api_user) {
2028     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat");
2029     PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2030     PetscOptionsEnd();
2031   } else {
2032     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat");
2033     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg));
2034     PetscOptionsEnd();
2035   }
2036   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2037 
2038   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
2039   PetscFunctionReturn(PETSC_SUCCESS);
2040 }
2041 
2042 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
2043 {
2044   Mat_Product *product     = C->product;
2045   PetscInt     alg         = 0; /* default algorithm */
2046   PetscBool    flg         = PETSC_FALSE;
2047   const char  *algTypes[2] = {"default", "color"};
2048   PetscInt     nalg        = 2;
2049 
2050   PetscFunctionBegin;
2051   /* Set default algorithm */
2052   PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
2053   if (!flg) {
2054     alg = 1;
2055     PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2056   }
2057 
2058   /* Get runtime option */
2059   if (product->api_user) {
2060     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat");
2061     PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2062     PetscOptionsEnd();
2063   } else {
2064     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat");
2065     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg));
2066     PetscOptionsEnd();
2067   }
2068   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2069 
2070   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
2071   C->ops->productsymbolic          = MatProductSymbolic_ABt;
2072   PetscFunctionReturn(PETSC_SUCCESS);
2073 }
2074 
2075 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
2076 {
2077   Mat_Product *product = C->product;
2078   PetscBool    flg     = PETSC_FALSE;
2079   PetscInt     alg     = 0; /* default algorithm -- alg=1 should be default!!! */
2080 #if !defined(PETSC_HAVE_HYPRE)
2081   const char *algTypes[2] = {"scalable", "rap"};
2082   PetscInt    nalg        = 2;
2083 #else
2084   const char *algTypes[3] = {"scalable", "rap", "hypre"};
2085   PetscInt    nalg        = 3;
2086 #endif
2087 
2088   PetscFunctionBegin;
2089   /* Set default algorithm */
2090   PetscCall(PetscStrcmp(product->alg, "default", &flg));
2091   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2092 
2093   /* Get runtime option */
2094   if (product->api_user) {
2095     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
2096     PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2097     PetscOptionsEnd();
2098   } else {
2099     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
2100     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2101     PetscOptionsEnd();
2102   }
2103   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2104 
2105   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
2106   PetscFunctionReturn(PETSC_SUCCESS);
2107 }
2108 
2109 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2110 {
2111   Mat_Product *product     = C->product;
2112   PetscBool    flg         = PETSC_FALSE;
2113   PetscInt     alg         = 0; /* default algorithm */
2114   const char  *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"};
2115   PetscInt     nalg        = 3;
2116 
2117   PetscFunctionBegin;
2118   /* Set default algorithm */
2119   PetscCall(PetscStrcmp(product->alg, "default", &flg));
2120   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2121 
2122   /* Get runtime option */
2123   if (product->api_user) {
2124     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat");
2125     PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg));
2126     PetscOptionsEnd();
2127   } else {
2128     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat");
2129     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg));
2130     PetscOptionsEnd();
2131   }
2132   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2133 
2134   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
2135   PetscFunctionReturn(PETSC_SUCCESS);
2136 }
2137 
2138 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
2139 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2140 {
2141   Mat_Product *product     = C->product;
2142   PetscInt     alg         = 0; /* default algorithm */
2143   PetscBool    flg         = PETSC_FALSE;
2144   const char  *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
2145   PetscInt     nalg        = 7;
2146 
2147   PetscFunctionBegin;
2148   /* Set default algorithm */
2149   PetscCall(PetscStrcmp(product->alg, "default", &flg));
2150   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2151 
2152   /* Get runtime option */
2153   if (product->api_user) {
2154     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat");
2155     PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2156     PetscOptionsEnd();
2157   } else {
2158     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat");
2159     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg));
2160     PetscOptionsEnd();
2161   }
2162   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2163 
2164   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
2165   C->ops->productsymbolic    = MatProductSymbolic_ABC;
2166   PetscFunctionReturn(PETSC_SUCCESS);
2167 }
2168 
2169 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2170 {
2171   Mat_Product *product = C->product;
2172 
2173   PetscFunctionBegin;
2174   switch (product->type) {
2175   case MATPRODUCT_AB:
2176     PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C));
2177     break;
2178   case MATPRODUCT_AtB:
2179     PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C));
2180     break;
2181   case MATPRODUCT_ABt:
2182     PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C));
2183     break;
2184   case MATPRODUCT_PtAP:
2185     PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C));
2186     break;
2187   case MATPRODUCT_RARt:
2188     PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C));
2189     break;
2190   case MATPRODUCT_ABC:
2191     PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C));
2192     break;
2193   default:
2194     break;
2195   }
2196   PetscFunctionReturn(PETSC_SUCCESS);
2197 }
2198