xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 89669be4d29968dc8d4c19ce1b69194a6a561ea4)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4           C = A * B
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <petscbt.h>
10 #include <petsc/private/isimpl.h>
11 #include <../src/mat/impls/dense/seq/dense.h>
12 
13 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
14 {
15   PetscFunctionBegin;
16   if (C->ops->matmultnumeric) {
17     PetscCheck(C->ops->matmultnumeric != MatMatMultNumeric_SeqAIJ_SeqAIJ,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Recursive call");
18     PetscCall((*C->ops->matmultnumeric)(A,B,C));
19   } else {
20     PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C));
21   }
22   PetscFunctionReturn(0);
23 }
24 
25 /* Modified from MatCreateSeqAIJWithArrays() */
26 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat)
27 {
28   PetscInt       ii;
29   Mat_SeqAIJ     *aij;
30   PetscBool      isseqaij, osingle, ofree_a, ofree_ij;
31 
32   PetscFunctionBegin;
33   PetscCheck(m <= 0 || !i[0],PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0");
34   PetscCall(MatSetSizes(mat,m,n,m,n));
35 
36   if (!mtype) {
37     PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij));
38     if (!isseqaij) PetscCall(MatSetType(mat,MATSEQAIJ));
39   } else {
40     PetscCall(MatSetType(mat,mtype));
41   }
42 
43   aij  = (Mat_SeqAIJ*)(mat)->data;
44   osingle = aij->singlemalloc;
45   ofree_a = aij->free_a;
46   ofree_ij = aij->free_ij;
47   /* changes the free flags */
48   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,NULL));
49 
50   PetscCall(PetscFree(aij->ilen));
51   PetscCall(PetscFree(aij->imax));
52   PetscCall(PetscMalloc1(m,&aij->imax));
53   PetscCall(PetscMalloc1(m,&aij->ilen));
54   for (ii=0,aij->nonzerorowcnt=0,aij->rmax = 0; ii<m; ii++) {
55     const PetscInt rnz = i[ii+1] - i[ii];
56     aij->nonzerorowcnt += !!rnz;
57     aij->rmax = PetscMax(aij->rmax,rnz);
58     aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii];
59   }
60   aij->maxnz = i[m];
61   aij->nz = i[m];
62 
63   if (osingle) {
64     PetscCall(PetscFree3(aij->a,aij->j,aij->i));
65   } else {
66     if (ofree_a)  PetscCall(PetscFree(aij->a));
67     if (ofree_ij) PetscCall(PetscFree(aij->j));
68     if (ofree_ij) PetscCall(PetscFree(aij->i));
69   }
70   aij->i            = i;
71   aij->j            = j;
72   aij->a            = a;
73   aij->nonew        = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
74   /* default to not retain ownership */
75   aij->singlemalloc = PETSC_FALSE;
76   aij->free_a       = PETSC_FALSE;
77   aij->free_ij      = PETSC_FALSE;
78   PetscCall(MatCheckCompressedRow(mat,aij->nonzerorowcnt,&aij->compressedrow,aij->i,m,0.6));
79   PetscFunctionReturn(0);
80 }
81 
82 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
83 {
84   Mat_Product         *product = C->product;
85   MatProductAlgorithm alg;
86   PetscBool           flg;
87 
88   PetscFunctionBegin;
89   if (product) {
90     alg = product->alg;
91   } else {
92     alg = "sorted";
93   }
94   /* sorted */
95   PetscCall(PetscStrcmp(alg,"sorted",&flg));
96   if (flg) {
97     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C));
98     PetscFunctionReturn(0);
99   }
100 
101   /* scalable */
102   PetscCall(PetscStrcmp(alg,"scalable",&flg));
103   if (flg) {
104     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C));
105     PetscFunctionReturn(0);
106   }
107 
108   /* scalable_fast */
109   PetscCall(PetscStrcmp(alg,"scalable_fast",&flg));
110   if (flg) {
111     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C));
112     PetscFunctionReturn(0);
113   }
114 
115   /* heap */
116   PetscCall(PetscStrcmp(alg,"heap",&flg));
117   if (flg) {
118     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C));
119     PetscFunctionReturn(0);
120   }
121 
122   /* btheap */
123   PetscCall(PetscStrcmp(alg,"btheap",&flg));
124   if (flg) {
125     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C));
126     PetscFunctionReturn(0);
127   }
128 
129   /* llcondensed */
130   PetscCall(PetscStrcmp(alg,"llcondensed",&flg));
131   if (flg) {
132     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C));
133     PetscFunctionReturn(0);
134   }
135 
136   /* rowmerge */
137   PetscCall(PetscStrcmp(alg,"rowmerge",&flg));
138   if (flg) {
139     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C));
140     PetscFunctionReturn(0);
141   }
142 
143 #if defined(PETSC_HAVE_HYPRE)
144   PetscCall(PetscStrcmp(alg,"hypre",&flg));
145   if (flg) {
146     PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C));
147     PetscFunctionReturn(0);
148   }
149 #endif
150 
151   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
152 }
153 
154 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C)
155 {
156   Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
157   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
158   PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
159   PetscReal          afill;
160   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
161   PetscTable         ta;
162   PetscBT            lnkbt;
163   PetscFreeSpaceList free_space=NULL,current_space=NULL;
164 
165   PetscFunctionBegin;
166   /* Get ci and cj */
167   /*---------------*/
168   /* Allocate ci array, arrays for fill computation and */
169   /* free space for accumulating nonzero column info */
170   PetscCall(PetscMalloc1(am+2,&ci));
171   ci[0] = 0;
172 
173   /* create and initialize a linked list */
174   PetscCall(PetscTableCreate(bn,bn,&ta));
175   MatRowMergeMax_SeqAIJ(b,bm,ta);
176   PetscCall(PetscTableGetCount(ta,&Crmax));
177   PetscCall(PetscTableDestroy(&ta));
178 
179   PetscCall(PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt));
180 
181   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
182   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
183 
184   current_space = free_space;
185 
186   /* Determine ci and cj */
187   for (i=0; i<am; i++) {
188     anzi = ai[i+1] - ai[i];
189     aj   = a->j + ai[i];
190     for (j=0; j<anzi; j++) {
191       brow = aj[j];
192       bnzj = bi[brow+1] - bi[brow];
193       bj   = b->j + bi[brow];
194       /* add non-zero cols of B into the sorted linked list lnk */
195       PetscCall(PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt));
196     }
197     /* add possible missing diagonal entry */
198     if (C->force_diagonals) {
199       PetscCall(PetscLLCondensedAddSorted(1,&i,lnk,lnkbt));
200     }
201     cnzi = lnk[0];
202 
203     /* If free space is not available, make more free space */
204     /* Double the amount of total space in the list */
205     if (current_space->local_remaining<cnzi) {
206       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space));
207       ndouble++;
208     }
209 
210     /* Copy data into free space, then initialize lnk */
211     PetscCall(PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt));
212 
213     current_space->array           += cnzi;
214     current_space->local_used      += cnzi;
215     current_space->local_remaining -= cnzi;
216 
217     ci[i+1] = ci[i] + cnzi;
218   }
219 
220   /* Column indices are in the list of free space */
221   /* Allocate space for cj, initialize cj, and */
222   /* destroy list of free space and other temporary array(s) */
223   PetscCall(PetscMalloc1(ci[am]+1,&cj));
224   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
225   PetscCall(PetscLLCondensedDestroy(lnk,lnkbt));
226 
227   /* put together the new symbolic matrix */
228   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
229   PetscCall(MatSetBlockSizesFromMats(C,A,B));
230 
231   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
232   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
233   c          = (Mat_SeqAIJ*)(C->data);
234   c->free_a  = PETSC_FALSE;
235   c->free_ij = PETSC_TRUE;
236   c->nonew   = 0;
237 
238   /* fast, needs non-scalable O(bn) array 'abdense' */
239   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
240 
241   /* set MatInfo */
242   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
243   if (afill < 1.0) afill = 1.0;
244   C->info.mallocs           = ndouble;
245   C->info.fill_ratio_given  = fill;
246   C->info.fill_ratio_needed = afill;
247 
248 #if defined(PETSC_USE_INFO)
249   if (ci[am]) {
250     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
251     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
252   } else {
253     PetscCall(PetscInfo(C,"Empty matrix product\n"));
254   }
255 #endif
256   PetscFunctionReturn(0);
257 }
258 
259 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C)
260 {
261   PetscLogDouble    flops=0.0;
262   Mat_SeqAIJ        *a   = (Mat_SeqAIJ*)A->data;
263   Mat_SeqAIJ        *b   = (Mat_SeqAIJ*)B->data;
264   Mat_SeqAIJ        *c   = (Mat_SeqAIJ*)C->data;
265   PetscInt          *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
266   PetscInt          am   =A->rmap->n,cm=C->rmap->n;
267   PetscInt          i,j,k,anzi,bnzi,cnzi,brow;
268   PetscScalar       *ca,valtmp;
269   PetscScalar       *ab_dense;
270   PetscContainer    cab_dense;
271   const PetscScalar *aa,*ba,*baj;
272 
273   PetscFunctionBegin;
274   PetscCall(MatSeqAIJGetArrayRead(A,&aa));
275   PetscCall(MatSeqAIJGetArrayRead(B,&ba));
276   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
277     PetscCall(PetscMalloc1(ci[cm]+1,&ca));
278     c->a      = ca;
279     c->free_a = PETSC_TRUE;
280   } else ca = c->a;
281 
282   /* TODO this should be done in the symbolic phase */
283   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
284      that is hard to eradicate) */
285   PetscCall(PetscObjectQuery((PetscObject)C,"__PETSc__ab_dense",(PetscObject*)&cab_dense));
286   if (!cab_dense) {
287     PetscCall(PetscMalloc1(B->cmap->N,&ab_dense));
288     PetscCall(PetscContainerCreate(PETSC_COMM_SELF,&cab_dense));
289     PetscCall(PetscContainerSetPointer(cab_dense,ab_dense));
290     PetscCall(PetscContainerSetUserDestroy(cab_dense,PetscContainerUserDestroyDefault));
291     PetscCall(PetscObjectCompose((PetscObject)C,"__PETSc__ab_dense",(PetscObject)cab_dense));
292     PetscCall(PetscObjectDereference((PetscObject)cab_dense));
293   }
294   PetscCall(PetscContainerGetPointer(cab_dense,(void**)&ab_dense));
295   PetscCall(PetscArrayzero(ab_dense,B->cmap->N));
296 
297   /* clean old values in C */
298   PetscCall(PetscArrayzero(ca,ci[cm]));
299   /* Traverse A row-wise. */
300   /* Build the ith row in C by summing over nonzero columns in A, */
301   /* the rows of B corresponding to nonzeros of A. */
302   for (i=0; i<am; i++) {
303     anzi = ai[i+1] - ai[i];
304     for (j=0; j<anzi; j++) {
305       brow = aj[j];
306       bnzi = bi[brow+1] - bi[brow];
307       bjj  = bj + bi[brow];
308       baj  = ba + bi[brow];
309       /* perform dense axpy */
310       valtmp = aa[j];
311       for (k=0; k<bnzi; k++) {
312         ab_dense[bjj[k]] += valtmp*baj[k];
313       }
314       flops += 2*bnzi;
315     }
316     aj += anzi; aa += anzi;
317 
318     cnzi = ci[i+1] - ci[i];
319     for (k=0; k<cnzi; k++) {
320       ca[k]          += ab_dense[cj[k]];
321       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
322     }
323     flops += cnzi;
324     cj    += cnzi; ca += cnzi;
325   }
326 #if defined(PETSC_HAVE_DEVICE)
327   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
328 #endif
329   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
330   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
331   PetscCall(PetscLogFlops(flops));
332   PetscCall(MatSeqAIJRestoreArrayRead(A,&aa));
333   PetscCall(MatSeqAIJRestoreArrayRead(B,&ba));
334   PetscFunctionReturn(0);
335 }
336 
337 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
338 {
339   PetscLogDouble    flops=0.0;
340   Mat_SeqAIJ        *a   = (Mat_SeqAIJ*)A->data;
341   Mat_SeqAIJ        *b   = (Mat_SeqAIJ*)B->data;
342   Mat_SeqAIJ        *c   = (Mat_SeqAIJ*)C->data;
343   PetscInt          *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
344   PetscInt          am   = A->rmap->N,cm=C->rmap->N;
345   PetscInt          i,j,k,anzi,bnzi,cnzi,brow;
346   PetscScalar       *ca=c->a,valtmp;
347   const PetscScalar *aa,*ba,*baj;
348   PetscInt          nextb;
349 
350   PetscFunctionBegin;
351   PetscCall(MatSeqAIJGetArrayRead(A,&aa));
352   PetscCall(MatSeqAIJGetArrayRead(B,&ba));
353   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
354     PetscCall(PetscMalloc1(ci[cm]+1,&ca));
355     c->a      = ca;
356     c->free_a = PETSC_TRUE;
357   }
358 
359   /* clean old values in C */
360   PetscCall(PetscArrayzero(ca,ci[cm]));
361   /* Traverse A row-wise. */
362   /* Build the ith row in C by summing over nonzero columns in A, */
363   /* the rows of B corresponding to nonzeros of A. */
364   for (i=0; i<am; i++) {
365     anzi = ai[i+1] - ai[i];
366     cnzi = ci[i+1] - ci[i];
367     for (j=0; j<anzi; j++) {
368       brow = aj[j];
369       bnzi = bi[brow+1] - bi[brow];
370       bjj  = bj + bi[brow];
371       baj  = ba + bi[brow];
372       /* perform sparse axpy */
373       valtmp = aa[j];
374       nextb  = 0;
375       for (k=0; nextb<bnzi; k++) {
376         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
377           ca[k] += valtmp*baj[nextb++];
378         }
379       }
380       flops += 2*bnzi;
381     }
382     aj += anzi; aa += anzi;
383     cj += cnzi; ca += cnzi;
384   }
385 #if defined(PETSC_HAVE_DEVICE)
386   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
387 #endif
388   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
389   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
390   PetscCall(PetscLogFlops(flops));
391   PetscCall(MatSeqAIJRestoreArrayRead(A,&aa));
392   PetscCall(MatSeqAIJRestoreArrayRead(B,&ba));
393   PetscFunctionReturn(0);
394 }
395 
396 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C)
397 {
398   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
399   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
400   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
401   MatScalar          *ca;
402   PetscReal          afill;
403   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
404   PetscTable         ta;
405   PetscFreeSpaceList free_space=NULL,current_space=NULL;
406 
407   PetscFunctionBegin;
408   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
409   /*-----------------------------------------------------------------------------------------*/
410   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
411   PetscCall(PetscMalloc1(am+2,&ci));
412   ci[0] = 0;
413 
414   /* create and initialize a linked list */
415   PetscCall(PetscTableCreate(bn,bn,&ta));
416   MatRowMergeMax_SeqAIJ(b,bm,ta);
417   PetscCall(PetscTableGetCount(ta,&Crmax));
418   PetscCall(PetscTableDestroy(&ta));
419 
420   PetscCall(PetscLLCondensedCreate_fast(Crmax,&lnk));
421 
422   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
423   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
424   current_space = free_space;
425 
426   /* Determine ci and cj */
427   for (i=0; i<am; i++) {
428     anzi = ai[i+1] - ai[i];
429     aj   = a->j + ai[i];
430     for (j=0; j<anzi; j++) {
431       brow = aj[j];
432       bnzj = bi[brow+1] - bi[brow];
433       bj   = b->j + bi[brow];
434       /* add non-zero cols of B into the sorted linked list lnk */
435       PetscCall(PetscLLCondensedAddSorted_fast(bnzj,bj,lnk));
436     }
437     /* add possible missing diagonal entry */
438     if (C->force_diagonals) {
439       PetscCall(PetscLLCondensedAddSorted_fast(1,&i,lnk));
440     }
441     cnzi = lnk[1];
442 
443     /* If free space is not available, make more free space */
444     /* Double the amount of total space in the list */
445     if (current_space->local_remaining<cnzi) {
446       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space));
447       ndouble++;
448     }
449 
450     /* Copy data into free space, then initialize lnk */
451     PetscCall(PetscLLCondensedClean_fast(cnzi,current_space->array,lnk));
452 
453     current_space->array           += cnzi;
454     current_space->local_used      += cnzi;
455     current_space->local_remaining -= cnzi;
456 
457     ci[i+1] = ci[i] + cnzi;
458   }
459 
460   /* Column indices are in the list of free space */
461   /* Allocate space for cj, initialize cj, and */
462   /* destroy list of free space and other temporary array(s) */
463   PetscCall(PetscMalloc1(ci[am]+1,&cj));
464   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
465   PetscCall(PetscLLCondensedDestroy_fast(lnk));
466 
467   /* Allocate space for ca */
468   PetscCall(PetscCalloc1(ci[am]+1,&ca));
469 
470   /* put together the new symbolic matrix */
471   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C));
472   PetscCall(MatSetBlockSizesFromMats(C,A,B));
473 
474   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
475   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
476   c          = (Mat_SeqAIJ*)(C->data);
477   c->free_a  = PETSC_TRUE;
478   c->free_ij = PETSC_TRUE;
479   c->nonew   = 0;
480 
481   /* slower, less memory */
482   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
483 
484   /* set MatInfo */
485   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
486   if (afill < 1.0) afill = 1.0;
487   C->info.mallocs           = ndouble;
488   C->info.fill_ratio_given  = fill;
489   C->info.fill_ratio_needed = afill;
490 
491 #if defined(PETSC_USE_INFO)
492   if (ci[am]) {
493     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
494     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
495   } else {
496     PetscCall(PetscInfo(C,"Empty matrix product\n"));
497   }
498 #endif
499   PetscFunctionReturn(0);
500 }
501 
502 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C)
503 {
504   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
505   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
506   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
507   MatScalar          *ca;
508   PetscReal          afill;
509   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
510   PetscTable         ta;
511   PetscFreeSpaceList free_space=NULL,current_space=NULL;
512 
513   PetscFunctionBegin;
514   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
515   /*---------------------------------------------------------------------------------------------*/
516   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
517   PetscCall(PetscMalloc1(am+2,&ci));
518   ci[0] = 0;
519 
520   /* create and initialize a linked list */
521   PetscCall(PetscTableCreate(bn,bn,&ta));
522   MatRowMergeMax_SeqAIJ(b,bm,ta);
523   PetscCall(PetscTableGetCount(ta,&Crmax));
524   PetscCall(PetscTableDestroy(&ta));
525   PetscCall(PetscLLCondensedCreate_Scalable(Crmax,&lnk));
526 
527   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
528   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
529   current_space = free_space;
530 
531   /* Determine ci and cj */
532   for (i=0; i<am; i++) {
533     anzi = ai[i+1] - ai[i];
534     aj   = a->j + ai[i];
535     for (j=0; j<anzi; j++) {
536       brow = aj[j];
537       bnzj = bi[brow+1] - bi[brow];
538       bj   = b->j + bi[brow];
539       /* add non-zero cols of B into the sorted linked list lnk */
540       PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk));
541     }
542     /* add possible missing diagonal entry */
543     if (C->force_diagonals) {
544       PetscCall(PetscLLCondensedAddSorted_Scalable(1,&i,lnk));
545     }
546 
547     cnzi = lnk[0];
548 
549     /* If free space is not available, make more free space */
550     /* Double the amount of total space in the list */
551     if (current_space->local_remaining<cnzi) {
552       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space));
553       ndouble++;
554     }
555 
556     /* Copy data into free space, then initialize lnk */
557     PetscCall(PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk));
558 
559     current_space->array           += cnzi;
560     current_space->local_used      += cnzi;
561     current_space->local_remaining -= cnzi;
562 
563     ci[i+1] = ci[i] + cnzi;
564   }
565 
566   /* Column indices are in the list of free space */
567   /* Allocate space for cj, initialize cj, and */
568   /* destroy list of free space and other temporary array(s) */
569   PetscCall(PetscMalloc1(ci[am]+1,&cj));
570   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
571   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
572 
573   /* Allocate space for ca */
574   /*-----------------------*/
575   PetscCall(PetscCalloc1(ci[am]+1,&ca));
576 
577   /* put together the new symbolic matrix */
578   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C));
579   PetscCall(MatSetBlockSizesFromMats(C,A,B));
580 
581   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
582   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
583   c          = (Mat_SeqAIJ*)(C->data);
584   c->free_a  = PETSC_TRUE;
585   c->free_ij = PETSC_TRUE;
586   c->nonew   = 0;
587 
588   /* slower, less memory */
589   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
590 
591   /* set MatInfo */
592   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
593   if (afill < 1.0) afill = 1.0;
594   C->info.mallocs           = ndouble;
595   C->info.fill_ratio_given  = fill;
596   C->info.fill_ratio_needed = afill;
597 
598 #if defined(PETSC_USE_INFO)
599   if (ci[am]) {
600     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
601     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
602   } else {
603     PetscCall(PetscInfo(C,"Empty matrix product\n"));
604   }
605 #endif
606   PetscFunctionReturn(0);
607 }
608 
609 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C)
610 {
611   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
612   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
613   PetscInt           *ci,*cj,*bb;
614   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
615   PetscReal          afill;
616   PetscInt           i,j,col,ndouble = 0;
617   PetscFreeSpaceList free_space=NULL,current_space=NULL;
618   PetscHeap          h;
619 
620   PetscFunctionBegin;
621   /* Get ci and cj - by merging sorted rows using a heap */
622   /*---------------------------------------------------------------------------------------------*/
623   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
624   PetscCall(PetscMalloc1(am+2,&ci));
625   ci[0] = 0;
626 
627   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
628   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
629   current_space = free_space;
630 
631   PetscCall(PetscHeapCreate(a->rmax,&h));
632   PetscCall(PetscMalloc1(a->rmax,&bb));
633 
634   /* Determine ci and cj */
635   for (i=0; i<am; i++) {
636     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
637     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
638     ci[i+1] = ci[i];
639     /* Populate the min heap */
640     for (j=0; j<anzi; j++) {
641       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
642       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
643         PetscCall(PetscHeapAdd(h,j,bj[bb[j]++]));
644       }
645     }
646     /* Pick off the min element, adding it to free space */
647     PetscCall(PetscHeapPop(h,&j,&col));
648     while (j >= 0) {
649       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
650         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space));
651         ndouble++;
652       }
653       *(current_space->array++) = col;
654       current_space->local_used++;
655       current_space->local_remaining--;
656       ci[i+1]++;
657 
658       /* stash if anything else remains in this row of B */
659       if (bb[j] < bi[acol[j]+1]) PetscCall(PetscHeapStash(h,j,bj[bb[j]++]));
660       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
661         PetscInt j2,col2;
662         PetscCall(PetscHeapPeek(h,&j2,&col2));
663         if (col2 != col) break;
664         PetscCall(PetscHeapPop(h,&j2,&col2));
665         if (bb[j2] < bi[acol[j2]+1]) PetscCall(PetscHeapStash(h,j2,bj[bb[j2]++]));
666       }
667       /* Put any stashed elements back into the min heap */
668       PetscCall(PetscHeapUnstash(h));
669       PetscCall(PetscHeapPop(h,&j,&col));
670     }
671   }
672   PetscCall(PetscFree(bb));
673   PetscCall(PetscHeapDestroy(&h));
674 
675   /* Column indices are in the list of free space */
676   /* Allocate space for cj, initialize cj, and */
677   /* destroy list of free space and other temporary array(s) */
678   PetscCall(PetscMalloc1(ci[am],&cj));
679   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
680 
681   /* put together the new symbolic matrix */
682   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
683   PetscCall(MatSetBlockSizesFromMats(C,A,B));
684 
685   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
686   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
687   c          = (Mat_SeqAIJ*)(C->data);
688   c->free_a  = PETSC_TRUE;
689   c->free_ij = PETSC_TRUE;
690   c->nonew   = 0;
691 
692   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
693 
694   /* set MatInfo */
695   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
696   if (afill < 1.0) afill = 1.0;
697   C->info.mallocs           = ndouble;
698   C->info.fill_ratio_given  = fill;
699   C->info.fill_ratio_needed = afill;
700 
701 #if defined(PETSC_USE_INFO)
702   if (ci[am]) {
703     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
704     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
705   } else {
706     PetscCall(PetscInfo(C,"Empty matrix product\n"));
707   }
708 #endif
709   PetscFunctionReturn(0);
710 }
711 
712 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C)
713 {
714   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
715   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
716   PetscInt           *ci,*cj,*bb;
717   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
718   PetscReal          afill;
719   PetscInt           i,j,col,ndouble = 0;
720   PetscFreeSpaceList free_space=NULL,current_space=NULL;
721   PetscHeap          h;
722   PetscBT            bt;
723 
724   PetscFunctionBegin;
725   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
726   /*---------------------------------------------------------------------------------------------*/
727   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
728   PetscCall(PetscMalloc1(am+2,&ci));
729   ci[0] = 0;
730 
731   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
732   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
733 
734   current_space = free_space;
735 
736   PetscCall(PetscHeapCreate(a->rmax,&h));
737   PetscCall(PetscMalloc1(a->rmax,&bb));
738   PetscCall(PetscBTCreate(bn,&bt));
739 
740   /* Determine ci and cj */
741   for (i=0; i<am; i++) {
742     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
743     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
744     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
745     ci[i+1] = ci[i];
746     /* Populate the min heap */
747     for (j=0; j<anzi; j++) {
748       PetscInt brow = acol[j];
749       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
750         PetscInt bcol = bj[bb[j]];
751         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
752           PetscCall(PetscHeapAdd(h,j,bcol));
753           bb[j]++;
754           break;
755         }
756       }
757     }
758     /* Pick off the min element, adding it to free space */
759     PetscCall(PetscHeapPop(h,&j,&col));
760     while (j >= 0) {
761       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
762         fptr = NULL;                      /* need PetscBTMemzero */
763         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space));
764         ndouble++;
765       }
766       *(current_space->array++) = col;
767       current_space->local_used++;
768       current_space->local_remaining--;
769       ci[i+1]++;
770 
771       /* stash if anything else remains in this row of B */
772       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
773         PetscInt bcol = bj[bb[j]];
774         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
775           PetscCall(PetscHeapAdd(h,j,bcol));
776           bb[j]++;
777           break;
778         }
779       }
780       PetscCall(PetscHeapPop(h,&j,&col));
781     }
782     if (fptr) {                 /* Clear the bits for this row */
783       for (; fptr<current_space->array; fptr++) PetscCall(PetscBTClear(bt,*fptr));
784     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
785       PetscCall(PetscBTMemzero(bn,bt));
786     }
787   }
788   PetscCall(PetscFree(bb));
789   PetscCall(PetscHeapDestroy(&h));
790   PetscCall(PetscBTDestroy(&bt));
791 
792   /* Column indices are in the list of free space */
793   /* Allocate space for cj, initialize cj, and */
794   /* destroy list of free space and other temporary array(s) */
795   PetscCall(PetscMalloc1(ci[am],&cj));
796   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
797 
798   /* put together the new symbolic matrix */
799   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
800   PetscCall(MatSetBlockSizesFromMats(C,A,B));
801 
802   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
803   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
804   c          = (Mat_SeqAIJ*)(C->data);
805   c->free_a  = PETSC_TRUE;
806   c->free_ij = PETSC_TRUE;
807   c->nonew   = 0;
808 
809   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
810 
811   /* set MatInfo */
812   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
813   if (afill < 1.0) afill = 1.0;
814   C->info.mallocs           = ndouble;
815   C->info.fill_ratio_given  = fill;
816   C->info.fill_ratio_needed = afill;
817 
818 #if defined(PETSC_USE_INFO)
819   if (ci[am]) {
820     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
821     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
822   } else {
823     PetscCall(PetscInfo(C,"Empty matrix product\n"));
824   }
825 #endif
826   PetscFunctionReturn(0);
827 }
828 
829 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C)
830 {
831   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
832   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1;
833   PetscInt           *ci,*cj,*outputj,worki_L1[9],worki_L2[9];
834   PetscInt           c_maxmem,a_maxrownnz=0,a_rownnz;
835   const PetscInt     workcol[8]={0,1,2,3,4,5,6,7};
836   const PetscInt     am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
837   const PetscInt     *brow_ptr[8],*brow_end[8];
838   PetscInt           window[8];
839   PetscInt           window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows;
840   PetscInt           i,k,ndouble=0,L1_rowsleft,rowsleft;
841   PetscReal          afill;
842   PetscInt           *workj_L1,*workj_L2,*workj_L3;
843   PetscInt           L1_nnz,L2_nnz;
844 
845   /* Step 1: Get upper bound on memory required for allocation.
846              Because of the way virtual memory works,
847              only the memory pages that are actually needed will be physically allocated. */
848   PetscFunctionBegin;
849   PetscCall(PetscMalloc1(am+1,&ci));
850   for (i=0; i<am; i++) {
851     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
852     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
853     a_rownnz = 0;
854     for (k=0; k<anzi; ++k) {
855       a_rownnz += bi[acol[k]+1] - bi[acol[k]];
856       if (a_rownnz > bn) {
857         a_rownnz = bn;
858         break;
859       }
860     }
861     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
862   }
863   /* temporary work areas for merging rows */
864   PetscCall(PetscMalloc1(a_maxrownnz*8,&workj_L1));
865   PetscCall(PetscMalloc1(a_maxrownnz*8,&workj_L2));
866   PetscCall(PetscMalloc1(a_maxrownnz,&workj_L3));
867 
868   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
869   c_maxmem = 8*(ai[am]+bi[bm]);
870   /* Step 2: Populate pattern for C */
871   PetscCall(PetscMalloc1(c_maxmem,&cj));
872 
873   ci_nnz       = 0;
874   ci[0]        = 0;
875   worki_L1[0]  = 0;
876   worki_L2[0]  = 0;
877   for (i=0; i<am; i++) {
878     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
879     const PetscInt *acol = aj + ai[i];      /* column indices of nonzero entries in this row */
880     rowsleft             = anzi;
881     inputcol_L1          = acol;
882     L2_nnz               = 0;
883     L2_nrows             = 1;  /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
884     worki_L2[1]          = 0;
885     outputi_nnz          = 0;
886 
887     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
888     while (ci_nnz+a_maxrownnz > c_maxmem) {
889       c_maxmem *= 2;
890       ndouble++;
891       PetscCall(PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj));
892     }
893 
894     while (rowsleft) {
895       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
896       L1_nrows    = 0;
897       L1_nnz      = 0;
898       inputcol    = inputcol_L1;
899       inputi      = bi;
900       inputj      = bj;
901 
902       /* The following macro is used to specialize for small rows in A.
903          This helps with compiler unrolling, improving performance substantially.
904           Input:  inputj   inputi  inputcol  bn
905           Output: outputj  outputi_nnz                       */
906        #define MatMatMultSymbolic_RowMergeMacro(ANNZ)                        \
907          window_min  = bn;                                                   \
908          outputi_nnz = 0;                                                    \
909          for (k=0; k<ANNZ; ++k) {                                            \
910            brow_ptr[k] = inputj + inputi[inputcol[k]];                       \
911            brow_end[k] = inputj + inputi[inputcol[k]+1];                     \
912            window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn;   \
913            window_min  = PetscMin(window[k], window_min);                    \
914          }                                                                   \
915          while (window_min < bn) {                                           \
916            outputj[outputi_nnz++] = window_min;                              \
917            /* advance front and compute new minimum */                       \
918            old_window_min = window_min;                                      \
919            window_min = bn;                                                  \
920            for (k=0; k<ANNZ; ++k) {                                          \
921              if (window[k] == old_window_min) {                              \
922                brow_ptr[k]++;                                                \
923                window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
924              }                                                               \
925              window_min = PetscMin(window[k], window_min);                   \
926            }                                                                 \
927          }
928 
929       /************** L E V E L  1 ***************/
930       /* Merge up to 8 rows of B to L1 work array*/
931       while (L1_rowsleft) {
932         outputi_nnz = 0;
933         if (anzi > 8)  outputj = workj_L1 + L1_nnz;     /* Level 1 rowmerge*/
934         else           outputj = cj + ci_nnz;           /* Merge directly to C */
935 
936         switch (L1_rowsleft) {
937         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
938                  brow_end[0] = inputj + inputi[inputcol[0]+1];
939                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
940                  inputcol    += L1_rowsleft;
941                  rowsleft    -= L1_rowsleft;
942                  L1_rowsleft  = 0;
943                  break;
944         case 2:  MatMatMultSymbolic_RowMergeMacro(2);
945                  inputcol    += L1_rowsleft;
946                  rowsleft    -= L1_rowsleft;
947                  L1_rowsleft  = 0;
948                  break;
949         case 3: MatMatMultSymbolic_RowMergeMacro(3);
950                  inputcol    += L1_rowsleft;
951                  rowsleft    -= L1_rowsleft;
952                  L1_rowsleft  = 0;
953                  break;
954         case 4:  MatMatMultSymbolic_RowMergeMacro(4);
955                  inputcol    += L1_rowsleft;
956                  rowsleft    -= L1_rowsleft;
957                  L1_rowsleft  = 0;
958                  break;
959         case 5:  MatMatMultSymbolic_RowMergeMacro(5);
960                  inputcol    += L1_rowsleft;
961                  rowsleft    -= L1_rowsleft;
962                  L1_rowsleft  = 0;
963                  break;
964         case 6:  MatMatMultSymbolic_RowMergeMacro(6);
965                  inputcol    += L1_rowsleft;
966                  rowsleft    -= L1_rowsleft;
967                  L1_rowsleft  = 0;
968                  break;
969         case 7:  MatMatMultSymbolic_RowMergeMacro(7);
970                  inputcol    += L1_rowsleft;
971                  rowsleft    -= L1_rowsleft;
972                  L1_rowsleft  = 0;
973                  break;
974         default: MatMatMultSymbolic_RowMergeMacro(8);
975                  inputcol    += 8;
976                  rowsleft    -= 8;
977                  L1_rowsleft -= 8;
978                  break;
979         }
980         inputcol_L1           = inputcol;
981         L1_nnz               += outputi_nnz;
982         worki_L1[++L1_nrows]  = L1_nnz;
983       }
984 
985       /********************** L E V E L  2 ************************/
986       /* Merge from L1 work array to either C or to L2 work array */
987       if (anzi > 8) {
988         inputi      = worki_L1;
989         inputj      = workj_L1;
990         inputcol    = workcol;
991         outputi_nnz = 0;
992 
993         if (anzi <= 64) outputj = cj + ci_nnz;        /* Merge from L1 work array to C */
994         else            outputj = workj_L2 + L2_nnz;  /* Merge from L1 work array to L2 work array */
995 
996         switch (L1_nrows) {
997         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
998                  brow_end[0] = inputj + inputi[inputcol[0]+1];
999                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1000                  break;
1001         case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
1002         case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
1003         case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1004         case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1005         case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1006         case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1007         case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1008         default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
1009         }
1010         L2_nnz               += outputi_nnz;
1011         worki_L2[++L2_nrows]  = L2_nnz;
1012 
1013         /************************ L E V E L  3 **********************/
1014         /* Merge from L2 work array to either C or to L2 work array */
1015         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1016           inputi      = worki_L2;
1017           inputj      = workj_L2;
1018           inputcol    = workcol;
1019           outputi_nnz = 0;
1020           if (rowsleft) outputj = workj_L3;
1021           else          outputj = cj + ci_nnz;
1022           switch (L2_nrows) {
1023           case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
1024                    brow_end[0] = inputj + inputi[inputcol[0]+1];
1025                    for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1026                    break;
1027           case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
1028           case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
1029           case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1030           case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1031           case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1032           case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1033           case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1034           default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1035           }
1036           L2_nrows    = 1;
1037           L2_nnz      = outputi_nnz;
1038           worki_L2[1] = outputi_nnz;
1039           /* Copy to workj_L2 */
1040           if (rowsleft) {
1041             for (k=0; k<outputi_nnz; ++k)  workj_L2[k] = outputj[k];
1042           }
1043         }
1044       }
1045     }  /* while (rowsleft) */
1046 #undef MatMatMultSymbolic_RowMergeMacro
1047 
1048     /* terminate current row */
1049     ci_nnz += outputi_nnz;
1050     ci[i+1] = ci_nnz;
1051   }
1052 
1053   /* Step 3: Create the new symbolic matrix */
1054   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
1055   PetscCall(MatSetBlockSizesFromMats(C,A,B));
1056 
1057   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1058   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1059   c          = (Mat_SeqAIJ*)(C->data);
1060   c->free_a  = PETSC_TRUE;
1061   c->free_ij = PETSC_TRUE;
1062   c->nonew   = 0;
1063 
1064   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1065 
1066   /* set MatInfo */
1067   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1068   if (afill < 1.0) afill = 1.0;
1069   C->info.mallocs           = ndouble;
1070   C->info.fill_ratio_given  = fill;
1071   C->info.fill_ratio_needed = afill;
1072 
1073 #if defined(PETSC_USE_INFO)
1074   if (ci[am]) {
1075     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
1076     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
1077   } else {
1078     PetscCall(PetscInfo(C,"Empty matrix product\n"));
1079   }
1080 #endif
1081 
1082   /* Step 4: Free temporary work areas */
1083   PetscCall(PetscFree(workj_L1));
1084   PetscCall(PetscFree(workj_L2));
1085   PetscCall(PetscFree(workj_L3));
1086   PetscFunctionReturn(0);
1087 }
1088 
1089 /* concatenate unique entries and then sort */
1090 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C)
1091 {
1092   Mat_SeqAIJ     *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1093   const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
1094   PetscInt       *ci,*cj,bcol;
1095   PetscInt       am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1096   PetscReal      afill;
1097   PetscInt       i,j,ndouble = 0;
1098   PetscSegBuffer seg,segrow;
1099   char           *seen;
1100 
1101   PetscFunctionBegin;
1102   PetscCall(PetscMalloc1(am+1,&ci));
1103   ci[0] = 0;
1104 
1105   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1106   PetscCall(PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg));
1107   PetscCall(PetscSegBufferCreate(sizeof(PetscInt),100,&segrow));
1108   PetscCall(PetscCalloc1(bn,&seen));
1109 
1110   /* Determine ci and cj */
1111   for (i=0; i<am; i++) {
1112     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1113     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1114     PetscInt packlen = 0,*PETSC_RESTRICT crow;
1115 
1116     /* Pack segrow */
1117     for (j=0; j<anzi; j++) {
1118       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
1119       for (k=bjstart; k<bjend; k++) {
1120         bcol = bj[k];
1121         if (!seen[bcol]) { /* new entry */
1122           PetscInt *PETSC_RESTRICT slot;
1123           PetscCall(PetscSegBufferGetInts(segrow,1,&slot));
1124           *slot = bcol;
1125           seen[bcol] = 1;
1126           packlen++;
1127         }
1128       }
1129     }
1130 
1131     /* Check i-th diagonal entry */
1132     if (C->force_diagonals && !seen[i]) {
1133       PetscInt *PETSC_RESTRICT slot;
1134       PetscCall(PetscSegBufferGetInts(segrow,1,&slot));
1135       *slot   = i;
1136       seen[i] = 1;
1137       packlen++;
1138     }
1139 
1140     PetscCall(PetscSegBufferGetInts(seg,packlen,&crow));
1141     PetscCall(PetscSegBufferExtractTo(segrow,crow));
1142     PetscCall(PetscSortInt(packlen,crow));
1143     ci[i+1] = ci[i] + packlen;
1144     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
1145   }
1146   PetscCall(PetscSegBufferDestroy(&segrow));
1147   PetscCall(PetscFree(seen));
1148 
1149   /* Column indices are in the segmented buffer */
1150   PetscCall(PetscSegBufferExtractAlloc(seg,&cj));
1151   PetscCall(PetscSegBufferDestroy(&seg));
1152 
1153   /* put together the new symbolic matrix */
1154   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
1155   PetscCall(MatSetBlockSizesFromMats(C,A,B));
1156 
1157   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1158   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1159   c          = (Mat_SeqAIJ*)(C->data);
1160   c->free_a  = PETSC_TRUE;
1161   c->free_ij = PETSC_TRUE;
1162   c->nonew   = 0;
1163 
1164   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1165 
1166   /* set MatInfo */
1167   afill = (PetscReal)ci[am]/PetscMax(ai[am]+bi[bm],1) + 1.e-5;
1168   if (afill < 1.0) afill = 1.0;
1169   C->info.mallocs           = ndouble;
1170   C->info.fill_ratio_given  = fill;
1171   C->info.fill_ratio_needed = afill;
1172 
1173 #if defined(PETSC_USE_INFO)
1174   if (ci[am]) {
1175     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
1176     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
1177   } else {
1178     PetscCall(PetscInfo(C,"Empty matrix product\n"));
1179   }
1180 #endif
1181   PetscFunctionReturn(0);
1182 }
1183 
1184 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
1185 {
1186   Mat_MatMatTransMult *abt=(Mat_MatMatTransMult *)data;
1187 
1188   PetscFunctionBegin;
1189   PetscCall(MatTransposeColoringDestroy(&abt->matcoloring));
1190   PetscCall(MatDestroy(&abt->Bt_den));
1191   PetscCall(MatDestroy(&abt->ABt_den));
1192   PetscCall(PetscFree(abt));
1193   PetscFunctionReturn(0);
1194 }
1195 
1196 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1197 {
1198   Mat                 Bt;
1199   PetscInt            *bti,*btj;
1200   Mat_MatMatTransMult *abt;
1201   Mat_Product         *product = C->product;
1202   char                *alg;
1203 
1204   PetscFunctionBegin;
1205   PetscCheck(product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1206   PetscCheck(!product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
1207 
1208   /* create symbolic Bt */
1209   PetscCall(MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj));
1210   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt));
1211   PetscCall(MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs)));
1212   PetscCall(MatSetType(Bt,((PetscObject)A)->type_name));
1213 
1214   /* get symbolic C=A*Bt */
1215   PetscCall(PetscStrallocpy(product->alg,&alg));
1216   PetscCall(MatProductSetAlgorithm(C,"sorted")); /* set algorithm for C = A*Bt */
1217   PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C));
1218   PetscCall(MatProductSetAlgorithm(C,alg)); /* resume original algorithm for ABt product */
1219   PetscCall(PetscFree(alg));
1220 
1221   /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
1222   PetscCall(PetscNew(&abt));
1223 
1224   product->data    = abt;
1225   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
1226 
1227   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
1228 
1229   abt->usecoloring = PETSC_FALSE;
1230   PetscCall(PetscStrcmp(product->alg,"color",&abt->usecoloring));
1231   if (abt->usecoloring) {
1232     /* Create MatTransposeColoring from symbolic C=A*B^T */
1233     MatTransposeColoring matcoloring;
1234     MatColoring          coloring;
1235     ISColoring           iscoloring;
1236     Mat                  Bt_dense,C_dense;
1237 
1238     /* inode causes memory problem */
1239     PetscCall(MatSetOption(C,MAT_USE_INODES,PETSC_FALSE));
1240 
1241     PetscCall(MatColoringCreate(C,&coloring));
1242     PetscCall(MatColoringSetDistance(coloring,2));
1243     PetscCall(MatColoringSetType(coloring,MATCOLORINGSL));
1244     PetscCall(MatColoringSetFromOptions(coloring));
1245     PetscCall(MatColoringApply(coloring,&iscoloring));
1246     PetscCall(MatColoringDestroy(&coloring));
1247     PetscCall(MatTransposeColoringCreate(C,iscoloring,&matcoloring));
1248 
1249     abt->matcoloring = matcoloring;
1250 
1251     PetscCall(ISColoringDestroy(&iscoloring));
1252 
1253     /* Create Bt_dense and C_dense = A*Bt_dense */
1254     PetscCall(MatCreate(PETSC_COMM_SELF,&Bt_dense));
1255     PetscCall(MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors));
1256     PetscCall(MatSetType(Bt_dense,MATSEQDENSE));
1257     PetscCall(MatSeqDenseSetPreallocation(Bt_dense,NULL));
1258 
1259     Bt_dense->assembled = PETSC_TRUE;
1260     abt->Bt_den         = Bt_dense;
1261 
1262     PetscCall(MatCreate(PETSC_COMM_SELF,&C_dense));
1263     PetscCall(MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors));
1264     PetscCall(MatSetType(C_dense,MATSEQDENSE));
1265     PetscCall(MatSeqDenseSetPreallocation(C_dense,NULL));
1266 
1267     Bt_dense->assembled = PETSC_TRUE;
1268     abt->ABt_den  = C_dense;
1269 
1270 #if defined(PETSC_USE_INFO)
1271     {
1272       Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
1273       PetscCall(PetscInfo(C,"Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(double)(((PetscReal)(c->nz))/((PetscReal)(A->rmap->n*matcoloring->ncolors)))));
1274     }
1275 #endif
1276   }
1277   /* clean up */
1278   PetscCall(MatDestroy(&Bt));
1279   PetscCall(MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj));
1280   PetscFunctionReturn(0);
1281 }
1282 
1283 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1284 {
1285   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1286   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
1287   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
1288   PetscLogDouble      flops=0.0;
1289   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
1290   Mat_MatMatTransMult *abt;
1291   Mat_Product         *product = C->product;
1292 
1293   PetscFunctionBegin;
1294   PetscCheck(product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1295   abt = (Mat_MatMatTransMult *)product->data;
1296   PetscCheck(abt,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1297   /* clear old values in C */
1298   if (!c->a) {
1299     PetscCall(PetscCalloc1(ci[cm]+1,&ca));
1300     c->a      = ca;
1301     c->free_a = PETSC_TRUE;
1302   } else {
1303     ca =  c->a;
1304     PetscCall(PetscArrayzero(ca,ci[cm]+1));
1305   }
1306 
1307   if (abt->usecoloring) {
1308     MatTransposeColoring matcoloring = abt->matcoloring;
1309     Mat                  Bt_dense,C_dense = abt->ABt_den;
1310 
1311     /* Get Bt_dense by Apply MatTransposeColoring to B */
1312     Bt_dense = abt->Bt_den;
1313     PetscCall(MatTransColoringApplySpToDen(matcoloring,B,Bt_dense));
1314 
1315     /* C_dense = A*Bt_dense */
1316     PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense));
1317 
1318     /* Recover C from C_dense */
1319     PetscCall(MatTransColoringApplyDenToSp(matcoloring,C_dense,C));
1320     PetscFunctionReturn(0);
1321   }
1322 
1323   for (i=0; i<cm; i++) {
1324     anzi = ai[i+1] - ai[i];
1325     acol = aj + ai[i];
1326     aval = aa + ai[i];
1327     cnzi = ci[i+1] - ci[i];
1328     ccol = cj + ci[i];
1329     cval = ca + ci[i];
1330     for (j=0; j<cnzi; j++) {
1331       brow = ccol[j];
1332       bnzj = bi[brow+1] - bi[brow];
1333       bcol = bj + bi[brow];
1334       bval = ba + bi[brow];
1335 
1336       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
1337       nexta = 0; nextb = 0;
1338       while (nexta<anzi && nextb<bnzj) {
1339         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
1340         if (nexta == anzi) break;
1341         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
1342         if (nextb == bnzj) break;
1343         if (acol[nexta] == bcol[nextb]) {
1344           cval[j] += aval[nexta]*bval[nextb];
1345           nexta++; nextb++;
1346           flops += 2;
1347         }
1348       }
1349     }
1350   }
1351   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
1352   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
1353   PetscCall(PetscLogFlops(flops));
1354   PetscFunctionReturn(0);
1355 }
1356 
1357 PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
1358 {
1359   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult*)data;
1360 
1361   PetscFunctionBegin;
1362   PetscCall(MatDestroy(&atb->At));
1363   if (atb->destroy) {
1364     PetscCall((*atb->destroy)(atb->data));
1365   }
1366   PetscCall(PetscFree(atb));
1367   PetscFunctionReturn(0);
1368 }
1369 
1370 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1371 {
1372   Mat            At = NULL;
1373   PetscInt       *ati,*atj;
1374   Mat_Product    *product = C->product;
1375   PetscBool      flg,def,square;
1376 
1377   PetscFunctionBegin;
1378   MatCheckProduct(C,4);
1379   square = (PetscBool)(A == B && A->symmetric && A->symmetric_set);
1380   /* outerproduct */
1381   PetscCall(PetscStrcmp(product->alg,"outerproduct",&flg));
1382   if (flg) {
1383     /* create symbolic At */
1384     if (!square) {
1385       PetscCall(MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj));
1386       PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At));
1387       PetscCall(MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs)));
1388       PetscCall(MatSetType(At,((PetscObject)A)->type_name));
1389     }
1390     /* get symbolic C=At*B */
1391     PetscCall(MatProductSetAlgorithm(C,"sorted"));
1392     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C));
1393 
1394     /* clean up */
1395     if (!square) {
1396       PetscCall(MatDestroy(&At));
1397       PetscCall(MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj));
1398     }
1399 
1400     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
1401     PetscCall(MatProductSetAlgorithm(C,"outerproduct"));
1402     PetscFunctionReturn(0);
1403   }
1404 
1405   /* matmatmult */
1406   PetscCall(PetscStrcmp(product->alg,"default",&def));
1407   PetscCall(PetscStrcmp(product->alg,"at*b",&flg));
1408   if (flg || def) {
1409     Mat_MatTransMatMult *atb;
1410 
1411     PetscCheck(!product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
1412     PetscCall(PetscNew(&atb));
1413     if (!square) {
1414       PetscCall(MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At));
1415     }
1416     PetscCall(MatProductSetAlgorithm(C,"sorted"));
1417     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C));
1418     PetscCall(MatProductSetAlgorithm(C,"at*b"));
1419     product->data    = atb;
1420     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
1421     atb->At          = At;
1422     atb->updateAt    = PETSC_FALSE; /* because At is computed here */
1423 
1424     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
1425     PetscFunctionReturn(0);
1426   }
1427 
1428   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1429 }
1430 
1431 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1432 {
1433   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1434   PetscInt       am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1435   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1436   PetscLogDouble flops=0.0;
1437   MatScalar      *aa=a->a,*ba,*ca,*caj;
1438 
1439   PetscFunctionBegin;
1440   if (!c->a) {
1441     PetscCall(PetscCalloc1(ci[cm]+1,&ca));
1442 
1443     c->a      = ca;
1444     c->free_a = PETSC_TRUE;
1445   } else {
1446     ca   = c->a;
1447     PetscCall(PetscArrayzero(ca,ci[cm]));
1448   }
1449 
1450   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1451   for (i=0; i<am; i++) {
1452     bj   = b->j + bi[i];
1453     ba   = b->a + bi[i];
1454     bnzi = bi[i+1] - bi[i];
1455     anzi = ai[i+1] - ai[i];
1456     for (j=0; j<anzi; j++) {
1457       nextb = 0;
1458       crow  = *aj++;
1459       cjj   = cj + ci[crow];
1460       caj   = ca + ci[crow];
1461       /* perform sparse axpy operation.  Note cjj includes bj. */
1462       for (k=0; nextb<bnzi; k++) {
1463         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
1464           caj[k] += (*aa)*(*(ba+nextb));
1465           nextb++;
1466         }
1467       }
1468       flops += 2*bnzi;
1469       aa++;
1470     }
1471   }
1472 
1473   /* Assemble the final matrix and clean up */
1474   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
1475   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
1476   PetscCall(PetscLogFlops(flops));
1477   PetscFunctionReturn(0);
1478 }
1479 
1480 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
1481 {
1482   PetscFunctionBegin;
1483   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C));
1484   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1485   PetscFunctionReturn(0);
1486 }
1487 
1488 PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C,const PetscBool add)
1489 {
1490   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
1491   PetscScalar       *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4;
1492   const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av;
1493   const PetscInt    *aj;
1494   PetscInt          cm=C->rmap->n,cn=B->cmap->n,bm,am=A->rmap->n;
1495   PetscInt          clda;
1496   PetscInt          am4,bm4,col,i,j,n;
1497 
1498   PetscFunctionBegin;
1499   if (!cm || !cn) PetscFunctionReturn(0);
1500   PetscCall(MatSeqAIJGetArrayRead(A,&av));
1501   if (add) {
1502     PetscCall(MatDenseGetArray(C,&c));
1503   } else {
1504     PetscCall(MatDenseGetArrayWrite(C,&c));
1505   }
1506   PetscCall(MatDenseGetArrayRead(B,&b));
1507   PetscCall(MatDenseGetLDA(B,&bm));
1508   PetscCall(MatDenseGetLDA(C,&clda));
1509   am4 = 4*clda;
1510   bm4 = 4*bm;
1511   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1512   c1 = c; c2 = c1 + clda; c3 = c2 + clda; c4 = c3 + clda;
1513   for (col=0; col<(cn/4)*4; col += 4) {  /* over columns of C */
1514     for (i=0; i<am; i++) {        /* over rows of A in those columns */
1515       r1 = r2 = r3 = r4 = 0.0;
1516       n  = a->i[i+1] - a->i[i];
1517       aj = a->j + a->i[i];
1518       aa = av + a->i[i];
1519       for (j=0; j<n; j++) {
1520         const PetscScalar aatmp = aa[j];
1521         const PetscInt    ajtmp = aj[j];
1522         r1 += aatmp*b1[ajtmp];
1523         r2 += aatmp*b2[ajtmp];
1524         r3 += aatmp*b3[ajtmp];
1525         r4 += aatmp*b4[ajtmp];
1526       }
1527       if (add) {
1528         c1[i] += r1;
1529         c2[i] += r2;
1530         c3[i] += r3;
1531         c4[i] += r4;
1532       } else {
1533         c1[i] = r1;
1534         c2[i] = r2;
1535         c3[i] = r3;
1536         c4[i] = r4;
1537       }
1538     }
1539     b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1540     c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1541   }
1542   /* process remaining columns */
1543   if (col != cn) {
1544     PetscInt rc = cn-col;
1545 
1546     if (rc == 1) {
1547       for (i=0; i<am; i++) {
1548         r1 = 0.0;
1549         n  = a->i[i+1] - a->i[i];
1550         aj = a->j + a->i[i];
1551         aa = av + a->i[i];
1552         for (j=0; j<n; j++) r1 += aa[j]*b1[aj[j]];
1553         if (add) c1[i] += r1;
1554         else c1[i] = r1;
1555       }
1556     } else if (rc == 2) {
1557       for (i=0; i<am; i++) {
1558         r1 = r2 = 0.0;
1559         n  = a->i[i+1] - a->i[i];
1560         aj = a->j + a->i[i];
1561         aa = av + a->i[i];
1562         for (j=0; j<n; j++) {
1563           const PetscScalar aatmp = aa[j];
1564           const PetscInt    ajtmp = aj[j];
1565           r1 += aatmp*b1[ajtmp];
1566           r2 += aatmp*b2[ajtmp];
1567         }
1568         if (add) {
1569           c1[i] += r1;
1570           c2[i] += r2;
1571         } else {
1572           c1[i] = r1;
1573           c2[i] = r2;
1574         }
1575       }
1576     } else {
1577       for (i=0; i<am; i++) {
1578         r1 = r2 = r3 = 0.0;
1579         n  = a->i[i+1] - a->i[i];
1580         aj = a->j + a->i[i];
1581         aa = av + a->i[i];
1582         for (j=0; j<n; j++) {
1583           const PetscScalar aatmp = aa[j];
1584           const PetscInt    ajtmp = aj[j];
1585           r1 += aatmp*b1[ajtmp];
1586           r2 += aatmp*b2[ajtmp];
1587           r3 += aatmp*b3[ajtmp];
1588         }
1589         if (add) {
1590           c1[i] += r1;
1591           c2[i] += r2;
1592           c3[i] += r3;
1593         } else {
1594           c1[i] = r1;
1595           c2[i] = r2;
1596           c3[i] = r3;
1597         }
1598       }
1599     }
1600   }
1601   PetscCall(PetscLogFlops(cn*(2.0*a->nz)));
1602   if (add) {
1603     PetscCall(MatDenseRestoreArray(C,&c));
1604   } else {
1605     PetscCall(MatDenseRestoreArrayWrite(C,&c));
1606   }
1607   PetscCall(MatDenseRestoreArrayRead(B,&b));
1608   PetscCall(MatSeqAIJRestoreArrayRead(A,&av));
1609   PetscFunctionReturn(0);
1610 }
1611 
1612 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1613 {
1614   PetscFunctionBegin;
1615   PetscCheck(B->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n);
1616   PetscCheck(A->rmap->n == C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n);
1617   PetscCheck(B->cmap->n == C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n);
1618 
1619   PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C,PETSC_FALSE));
1620   PetscFunctionReturn(0);
1621 }
1622 
1623 /* ------------------------------------------------------- */
1624 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1625 {
1626   PetscFunctionBegin;
1627   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
1628   C->ops->productsymbolic = MatProductSymbolic_AB;
1629   PetscFunctionReturn(0);
1630 }
1631 
1632 PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat,Mat,PetscReal,Mat);
1633 
1634 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1635 {
1636   PetscFunctionBegin;
1637   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1638   C->ops->productsymbolic          = MatProductSymbolic_AtB;
1639   PetscFunctionReturn(0);
1640 }
1641 
1642 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
1643 {
1644   PetscFunctionBegin;
1645   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1646   C->ops->productsymbolic          = MatProductSymbolic_ABt;
1647   PetscFunctionReturn(0);
1648 }
1649 
1650 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1651 {
1652   Mat_Product    *product = C->product;
1653 
1654   PetscFunctionBegin;
1655   switch (product->type) {
1656   case MATPRODUCT_AB:
1657     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C));
1658     break;
1659   case MATPRODUCT_AtB:
1660     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C));
1661     break;
1662   case MATPRODUCT_ABt:
1663     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C));
1664     break;
1665   default:
1666     break;
1667   }
1668   PetscFunctionReturn(0);
1669 }
1670 /* ------------------------------------------------------- */
1671 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1672 {
1673   Mat_Product    *product = C->product;
1674   Mat            A = product->A;
1675   PetscBool      baij;
1676 
1677   PetscFunctionBegin;
1678   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij));
1679   if (!baij) { /* A is seqsbaij */
1680     PetscBool sbaij;
1681     PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij));
1682     PetscCheck(sbaij,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format");
1683 
1684     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
1685   } else { /* A is seqbaij */
1686     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
1687   }
1688 
1689   C->ops->productsymbolic = MatProductSymbolic_AB;
1690   PetscFunctionReturn(0);
1691 }
1692 
1693 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1694 {
1695   Mat_Product    *product = C->product;
1696 
1697   PetscFunctionBegin;
1698   MatCheckProduct(C,1);
1699   PetscCheck(product->A,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
1700   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric)) {
1701     PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C));
1702   }
1703   PetscFunctionReturn(0);
1704 }
1705 
1706 /* ------------------------------------------------------- */
1707 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1708 {
1709   PetscFunctionBegin;
1710   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
1711   C->ops->productsymbolic = MatProductSymbolic_AB;
1712   PetscFunctionReturn(0);
1713 }
1714 
1715 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1716 {
1717   Mat_Product    *product = C->product;
1718 
1719   PetscFunctionBegin;
1720   if (product->type == MATPRODUCT_AB) {
1721     PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C));
1722   }
1723   PetscFunctionReturn(0);
1724 }
1725 /* ------------------------------------------------------- */
1726 
1727 PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1728 {
1729   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
1730   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
1731   PetscInt       *bi      = b->i,*bj=b->j;
1732   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1733   MatScalar      *btval,*btval_den,*ba=b->a;
1734   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1735 
1736   PetscFunctionBegin;
1737   btval_den=btdense->v;
1738   PetscCall(PetscArrayzero(btval_den,m*n));
1739   for (k=0; k<ncolors; k++) {
1740     ncolumns = coloring->ncolumns[k];
1741     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1742       col   = *(columns + colorforcol[k] + l);
1743       btcol = bj + bi[col];
1744       btval = ba + bi[col];
1745       anz   = bi[col+1] - bi[col];
1746       for (j=0; j<anz; j++) {
1747         brow            = btcol[j];
1748         btval_den[brow] = btval[j];
1749       }
1750     }
1751     btval_den += m;
1752   }
1753   PetscFunctionReturn(0);
1754 }
1755 
1756 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1757 {
1758   Mat_SeqAIJ        *csp = (Mat_SeqAIJ*)Csp->data;
1759   const PetscScalar *ca_den,*ca_den_ptr;
1760   PetscScalar       *ca=csp->a;
1761   PetscInt          k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1762   PetscInt          brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1763   PetscInt          nrows,*row,*idx;
1764   PetscInt          *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
1765 
1766   PetscFunctionBegin;
1767   PetscCall(MatDenseGetArrayRead(Cden,&ca_den));
1768 
1769   if (brows > 0) {
1770     PetscInt *lstart,row_end,row_start;
1771     lstart = matcoloring->lstart;
1772     PetscCall(PetscArrayzero(lstart,ncolors));
1773 
1774     row_end = brows;
1775     if (row_end > m) row_end = m;
1776     for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1777       ca_den_ptr = ca_den;
1778       for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1779         nrows = matcoloring->nrows[k];
1780         row   = rows  + colorforrow[k];
1781         idx   = den2sp + colorforrow[k];
1782         for (l=lstart[k]; l<nrows; l++) {
1783           if (row[l] >= row_end) {
1784             lstart[k] = l;
1785             break;
1786           } else {
1787             ca[idx[l]] = ca_den_ptr[row[l]];
1788           }
1789         }
1790         ca_den_ptr += m;
1791       }
1792       row_end += brows;
1793       if (row_end > m) row_end = m;
1794     }
1795   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1796     ca_den_ptr = ca_den;
1797     for (k=0; k<ncolors; k++) {
1798       nrows = matcoloring->nrows[k];
1799       row   = rows  + colorforrow[k];
1800       idx   = den2sp + colorforrow[k];
1801       for (l=0; l<nrows; l++) {
1802         ca[idx[l]] = ca_den_ptr[row[l]];
1803       }
1804       ca_den_ptr += m;
1805     }
1806   }
1807 
1808   PetscCall(MatDenseRestoreArrayRead(Cden,&ca_den));
1809 #if defined(PETSC_USE_INFO)
1810   if (matcoloring->brows > 0) {
1811     PetscCall(PetscInfo(Csp,"Loop over %" PetscInt_FMT " row blocks for den2sp\n",brows));
1812   } else {
1813     PetscCall(PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n"));
1814   }
1815 #endif
1816   PetscFunctionReturn(0);
1817 }
1818 
1819 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1820 {
1821   PetscInt       i,n,nrows,Nbs,j,k,m,ncols,col,cm;
1822   const PetscInt *is,*ci,*cj,*row_idx;
1823   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1824   IS             *isa;
1825   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1826   PetscInt       *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1827   PetscInt       *colorforcol,*columns,*columns_i,brows;
1828   PetscBool      flg;
1829 
1830   PetscFunctionBegin;
1831   PetscCall(ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa));
1832 
1833   /* bs >1 is not being tested yet! */
1834   Nbs       = mat->cmap->N/bs;
1835   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1836   c->N      = Nbs;
1837   c->m      = c->M;
1838   c->rstart = 0;
1839   c->brows  = 100;
1840 
1841   c->ncolors = nis;
1842   PetscCall(PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow));
1843   PetscCall(PetscMalloc1(csp->nz+1,&rows));
1844   PetscCall(PetscMalloc1(csp->nz+1,&den2sp));
1845 
1846   brows = c->brows;
1847   PetscCall(PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg));
1848   if (flg) c->brows = brows;
1849   if (brows > 0) {
1850     PetscCall(PetscMalloc1(nis+1,&c->lstart));
1851   }
1852 
1853   colorforrow[0] = 0;
1854   rows_i         = rows;
1855   den2sp_i       = den2sp;
1856 
1857   PetscCall(PetscMalloc1(nis+1,&colorforcol));
1858   PetscCall(PetscMalloc1(Nbs+1,&columns));
1859 
1860   colorforcol[0] = 0;
1861   columns_i      = columns;
1862 
1863   /* get column-wise storage of mat */
1864   PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL));
1865 
1866   cm   = c->m;
1867   PetscCall(PetscMalloc1(cm+1,&rowhit));
1868   PetscCall(PetscMalloc1(cm+1,&idxhit));
1869   for (i=0; i<nis; i++) { /* loop over color */
1870     PetscCall(ISGetLocalSize(isa[i],&n));
1871     PetscCall(ISGetIndices(isa[i],&is));
1872 
1873     c->ncolumns[i] = n;
1874     if (n) {
1875       PetscCall(PetscArraycpy(columns_i,is,n));
1876     }
1877     colorforcol[i+1] = colorforcol[i] + n;
1878     columns_i       += n;
1879 
1880     /* fast, crude version requires O(N*N) work */
1881     PetscCall(PetscArrayzero(rowhit,cm));
1882 
1883     for (j=0; j<n; j++) { /* loop over columns*/
1884       col     = is[j];
1885       row_idx = cj + ci[col];
1886       m       = ci[col+1] - ci[col];
1887       for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1888         idxhit[*row_idx]   = spidx[ci[col] + k];
1889         rowhit[*row_idx++] = col + 1;
1890       }
1891     }
1892     /* count the number of hits */
1893     nrows = 0;
1894     for (j=0; j<cm; j++) {
1895       if (rowhit[j]) nrows++;
1896     }
1897     c->nrows[i]      = nrows;
1898     colorforrow[i+1] = colorforrow[i] + nrows;
1899 
1900     nrows = 0;
1901     for (j=0; j<cm; j++) { /* loop over rows */
1902       if (rowhit[j]) {
1903         rows_i[nrows]   = j;
1904         den2sp_i[nrows] = idxhit[j];
1905         nrows++;
1906       }
1907     }
1908     den2sp_i += nrows;
1909 
1910     PetscCall(ISRestoreIndices(isa[i],&is));
1911     rows_i += nrows;
1912   }
1913   PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL));
1914   PetscCall(PetscFree(rowhit));
1915   PetscCall(ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa));
1916   PetscCheck(csp->nz == colorforrow[nis],PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT,csp->nz,colorforrow[nis]);
1917 
1918   c->colorforrow = colorforrow;
1919   c->rows        = rows;
1920   c->den2sp      = den2sp;
1921   c->colorforcol = colorforcol;
1922   c->columns     = columns;
1923 
1924   PetscCall(PetscFree(idxhit));
1925   PetscFunctionReturn(0);
1926 }
1927 
1928 /* --------------------------------------------------------------- */
1929 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1930 {
1931   Mat_Product    *product = C->product;
1932   Mat            A=product->A,B=product->B;
1933 
1934   PetscFunctionBegin;
1935   if (C->ops->mattransposemultnumeric) {
1936     /* Alg: "outerproduct" */
1937     PetscCall((*C->ops->mattransposemultnumeric)(A,B,C));
1938   } else {
1939     /* Alg: "matmatmult" -- C = At*B */
1940     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;
1941     Mat                 At;
1942 
1943     PetscCheck(atb,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1944     At = atb->At;
1945     if (atb->updateAt && At) { /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ() */
1946       PetscCall(MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At));
1947     }
1948     PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(At ? At : A,B,C));
1949     atb->updateAt = PETSC_TRUE;
1950   }
1951   PetscFunctionReturn(0);
1952 }
1953 
1954 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1955 {
1956   Mat_Product    *product = C->product;
1957   Mat            A=product->A,B=product->B;
1958   PetscReal      fill=product->fill;
1959 
1960   PetscFunctionBegin;
1961   PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C));
1962 
1963   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
1964   PetscFunctionReturn(0);
1965 }
1966 
1967 /* --------------------------------------------------------------- */
1968 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1969 {
1970   Mat_Product    *product = C->product;
1971   PetscInt       alg = 0; /* default algorithm */
1972   PetscBool      flg = PETSC_FALSE;
1973 #if !defined(PETSC_HAVE_HYPRE)
1974   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
1975   PetscInt       nalg = 7;
1976 #else
1977   const char     *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"};
1978   PetscInt       nalg = 8;
1979 #endif
1980 
1981   PetscFunctionBegin;
1982   /* Set default algorithm */
1983   PetscCall(PetscStrcmp(C->product->alg,"default",&flg));
1984   if (flg) {
1985     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
1986   }
1987 
1988   /* Get runtime option */
1989   if (product->api_user) {
1990     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");
1991     PetscCall(PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg));
1992     PetscOptionsEnd();
1993   } else {
1994     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");
1995     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg));
1996     PetscOptionsEnd();
1997   }
1998   if (flg) {
1999     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2000   }
2001 
2002   C->ops->productsymbolic = MatProductSymbolic_AB;
2003   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
2004   PetscFunctionReturn(0);
2005 }
2006 
2007 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
2008 {
2009   Mat_Product    *product = C->product;
2010   PetscInt       alg = 0; /* default algorithm */
2011   PetscBool      flg = PETSC_FALSE;
2012   const char     *algTypes[3] = {"default","at*b","outerproduct"};
2013   PetscInt       nalg = 3;
2014 
2015   PetscFunctionBegin;
2016   /* Get runtime option */
2017   if (product->api_user) {
2018     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");
2019     PetscCall(PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg));
2020     PetscOptionsEnd();
2021   } else {
2022     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");
2023     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg));
2024     PetscOptionsEnd();
2025   }
2026   if (flg) {
2027     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2028   }
2029 
2030   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
2031   PetscFunctionReturn(0);
2032 }
2033 
2034 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
2035 {
2036   Mat_Product    *product = C->product;
2037   PetscInt       alg = 0; /* default algorithm */
2038   PetscBool      flg = PETSC_FALSE;
2039   const char     *algTypes[2] = {"default","color"};
2040   PetscInt       nalg = 2;
2041 
2042   PetscFunctionBegin;
2043   /* Set default algorithm */
2044   PetscCall(PetscStrcmp(C->product->alg,"default",&flg));
2045   if (!flg) {
2046     alg = 1;
2047     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2048   }
2049 
2050   /* Get runtime option */
2051   if (product->api_user) {
2052     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");
2053     PetscCall(PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg));
2054     PetscOptionsEnd();
2055   } else {
2056     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");
2057     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg));
2058     PetscOptionsEnd();
2059   }
2060   if (flg) {
2061     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2062   }
2063 
2064   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
2065   C->ops->productsymbolic          = MatProductSymbolic_ABt;
2066   PetscFunctionReturn(0);
2067 }
2068 
2069 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
2070 {
2071   Mat_Product    *product = C->product;
2072   PetscBool      flg = PETSC_FALSE;
2073   PetscInt       alg = 0; /* default algorithm -- alg=1 should be default!!! */
2074 #if !defined(PETSC_HAVE_HYPRE)
2075   const char     *algTypes[2] = {"scalable","rap"};
2076   PetscInt       nalg = 2;
2077 #else
2078   const char     *algTypes[3] = {"scalable","rap","hypre"};
2079   PetscInt       nalg = 3;
2080 #endif
2081 
2082   PetscFunctionBegin;
2083   /* Set default algorithm */
2084   PetscCall(PetscStrcmp(product->alg,"default",&flg));
2085   if (flg) {
2086     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2087   }
2088 
2089   /* Get runtime option */
2090   if (product->api_user) {
2091     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");
2092     PetscCall(PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg));
2093     PetscOptionsEnd();
2094   } else {
2095     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");
2096     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg));
2097     PetscOptionsEnd();
2098   }
2099   if (flg) {
2100     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2101   }
2102 
2103   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
2104   PetscFunctionReturn(0);
2105 }
2106 
2107 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2108 {
2109   Mat_Product    *product = C->product;
2110   PetscBool      flg = PETSC_FALSE;
2111   PetscInt       alg = 0; /* default algorithm */
2112   const char     *algTypes[3] = {"r*a*rt","r*art","coloring_rart"};
2113   PetscInt        nalg = 3;
2114 
2115   PetscFunctionBegin;
2116   /* Set default algorithm */
2117   PetscCall(PetscStrcmp(product->alg,"default",&flg));
2118   if (flg) {
2119     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2120   }
2121 
2122   /* Get runtime option */
2123   if (product->api_user) {
2124     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");
2125     PetscCall(PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg));
2126     PetscOptionsEnd();
2127   } else {
2128     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");
2129     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg));
2130     PetscOptionsEnd();
2131   }
2132   if (flg) {
2133     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2134   }
2135 
2136   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
2137   PetscFunctionReturn(0);
2138 }
2139 
2140 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
2141 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2142 {
2143   Mat_Product    *product = C->product;
2144   PetscInt       alg = 0; /* default algorithm */
2145   PetscBool      flg = PETSC_FALSE;
2146   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
2147   PetscInt       nalg = 7;
2148 
2149   PetscFunctionBegin;
2150   /* Set default algorithm */
2151   PetscCall(PetscStrcmp(product->alg,"default",&flg));
2152   if (flg) {
2153     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2154   }
2155 
2156   /* Get runtime option */
2157   if (product->api_user) {
2158     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");
2159     PetscCall(PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg));
2160     PetscOptionsEnd();
2161   } else {
2162     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");
2163     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg));
2164     PetscOptionsEnd();
2165   }
2166   if (flg) {
2167     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2168   }
2169 
2170   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
2171   C->ops->productsymbolic    = MatProductSymbolic_ABC;
2172   PetscFunctionReturn(0);
2173 }
2174 
2175 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2176 {
2177   Mat_Product    *product = C->product;
2178 
2179   PetscFunctionBegin;
2180   switch (product->type) {
2181   case MATPRODUCT_AB:
2182     PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C));
2183     break;
2184   case MATPRODUCT_AtB:
2185     PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C));
2186     break;
2187   case MATPRODUCT_ABt:
2188     PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C));
2189     break;
2190   case MATPRODUCT_PtAP:
2191     PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C));
2192     break;
2193   case MATPRODUCT_RARt:
2194     PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C));
2195     break;
2196   case MATPRODUCT_ABC:
2197     PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C));
2198     break;
2199   default:
2200     break;
2201   }
2202   PetscFunctionReturn(0);
2203 }
2204