xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision dbbe0bcd3f3a8fbab5a45420dc06f8387e5764c6)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4           C = A * B
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <petscbt.h>
10 #include <petsc/private/isimpl.h>
11 #include <../src/mat/impls/dense/seq/dense.h>
12 
13 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
14 {
15   PetscFunctionBegin;
16   if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A,B,C));
17   else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C));
18   PetscFunctionReturn(0);
19 }
20 
21 /* Modified from MatCreateSeqAIJWithArrays() */
22 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat)
23 {
24   PetscInt       ii;
25   Mat_SeqAIJ     *aij;
26   PetscBool      isseqaij, osingle, ofree_a, ofree_ij;
27 
28   PetscFunctionBegin;
29   PetscCheck(m <= 0 || !i[0],PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0");
30   PetscCall(MatSetSizes(mat,m,n,m,n));
31 
32   if (!mtype) {
33     PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij));
34     if (!isseqaij) PetscCall(MatSetType(mat,MATSEQAIJ));
35   } else {
36     PetscCall(MatSetType(mat,mtype));
37   }
38 
39   aij  = (Mat_SeqAIJ*)(mat)->data;
40   osingle = aij->singlemalloc;
41   ofree_a = aij->free_a;
42   ofree_ij = aij->free_ij;
43   /* changes the free flags */
44   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,NULL));
45 
46   PetscCall(PetscFree(aij->ilen));
47   PetscCall(PetscFree(aij->imax));
48   PetscCall(PetscMalloc1(m,&aij->imax));
49   PetscCall(PetscMalloc1(m,&aij->ilen));
50   for (ii=0,aij->nonzerorowcnt=0,aij->rmax = 0; ii<m; ii++) {
51     const PetscInt rnz = i[ii+1] - i[ii];
52     aij->nonzerorowcnt += !!rnz;
53     aij->rmax = PetscMax(aij->rmax,rnz);
54     aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii];
55   }
56   aij->maxnz = i[m];
57   aij->nz = i[m];
58 
59   if (osingle) {
60     PetscCall(PetscFree3(aij->a,aij->j,aij->i));
61   } else {
62     if (ofree_a)  PetscCall(PetscFree(aij->a));
63     if (ofree_ij) PetscCall(PetscFree(aij->j));
64     if (ofree_ij) PetscCall(PetscFree(aij->i));
65   }
66   aij->i            = i;
67   aij->j            = j;
68   aij->a            = a;
69   aij->nonew        = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
70   /* default to not retain ownership */
71   aij->singlemalloc = PETSC_FALSE;
72   aij->free_a       = PETSC_FALSE;
73   aij->free_ij      = PETSC_FALSE;
74   PetscCall(MatCheckCompressedRow(mat,aij->nonzerorowcnt,&aij->compressedrow,aij->i,m,0.6));
75   PetscFunctionReturn(0);
76 }
77 
78 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
79 {
80   Mat_Product         *product = C->product;
81   MatProductAlgorithm alg;
82   PetscBool           flg;
83 
84   PetscFunctionBegin;
85   if (product) {
86     alg = product->alg;
87   } else {
88     alg = "sorted";
89   }
90   /* sorted */
91   PetscCall(PetscStrcmp(alg,"sorted",&flg));
92   if (flg) {
93     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C));
94     PetscFunctionReturn(0);
95   }
96 
97   /* scalable */
98   PetscCall(PetscStrcmp(alg,"scalable",&flg));
99   if (flg) {
100     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C));
101     PetscFunctionReturn(0);
102   }
103 
104   /* scalable_fast */
105   PetscCall(PetscStrcmp(alg,"scalable_fast",&flg));
106   if (flg) {
107     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C));
108     PetscFunctionReturn(0);
109   }
110 
111   /* heap */
112   PetscCall(PetscStrcmp(alg,"heap",&flg));
113   if (flg) {
114     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C));
115     PetscFunctionReturn(0);
116   }
117 
118   /* btheap */
119   PetscCall(PetscStrcmp(alg,"btheap",&flg));
120   if (flg) {
121     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C));
122     PetscFunctionReturn(0);
123   }
124 
125   /* llcondensed */
126   PetscCall(PetscStrcmp(alg,"llcondensed",&flg));
127   if (flg) {
128     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C));
129     PetscFunctionReturn(0);
130   }
131 
132   /* rowmerge */
133   PetscCall(PetscStrcmp(alg,"rowmerge",&flg));
134   if (flg) {
135     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C));
136     PetscFunctionReturn(0);
137   }
138 
139 #if defined(PETSC_HAVE_HYPRE)
140   PetscCall(PetscStrcmp(alg,"hypre",&flg));
141   if (flg) {
142     PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C));
143     PetscFunctionReturn(0);
144   }
145 #endif
146 
147   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
148 }
149 
150 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C)
151 {
152   Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
153   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
154   PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
155   PetscReal          afill;
156   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
157   PetscTable         ta;
158   PetscBT            lnkbt;
159   PetscFreeSpaceList free_space=NULL,current_space=NULL;
160 
161   PetscFunctionBegin;
162   /* Get ci and cj */
163   /*---------------*/
164   /* Allocate ci array, arrays for fill computation and */
165   /* free space for accumulating nonzero column info */
166   PetscCall(PetscMalloc1(am+2,&ci));
167   ci[0] = 0;
168 
169   /* create and initialize a linked list */
170   PetscCall(PetscTableCreate(bn,bn,&ta));
171   MatRowMergeMax_SeqAIJ(b,bm,ta);
172   PetscCall(PetscTableGetCount(ta,&Crmax));
173   PetscCall(PetscTableDestroy(&ta));
174 
175   PetscCall(PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt));
176 
177   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
178   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
179 
180   current_space = free_space;
181 
182   /* Determine ci and cj */
183   for (i=0; i<am; i++) {
184     anzi = ai[i+1] - ai[i];
185     aj   = a->j + ai[i];
186     for (j=0; j<anzi; j++) {
187       brow = aj[j];
188       bnzj = bi[brow+1] - bi[brow];
189       bj   = b->j + bi[brow];
190       /* add non-zero cols of B into the sorted linked list lnk */
191       PetscCall(PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt));
192     }
193     /* add possible missing diagonal entry */
194     if (C->force_diagonals) {
195       PetscCall(PetscLLCondensedAddSorted(1,&i,lnk,lnkbt));
196     }
197     cnzi = lnk[0];
198 
199     /* If free space is not available, make more free space */
200     /* Double the amount of total space in the list */
201     if (current_space->local_remaining<cnzi) {
202       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space));
203       ndouble++;
204     }
205 
206     /* Copy data into free space, then initialize lnk */
207     PetscCall(PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt));
208 
209     current_space->array           += cnzi;
210     current_space->local_used      += cnzi;
211     current_space->local_remaining -= cnzi;
212 
213     ci[i+1] = ci[i] + cnzi;
214   }
215 
216   /* Column indices are in the list of free space */
217   /* Allocate space for cj, initialize cj, and */
218   /* destroy list of free space and other temporary array(s) */
219   PetscCall(PetscMalloc1(ci[am]+1,&cj));
220   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
221   PetscCall(PetscLLCondensedDestroy(lnk,lnkbt));
222 
223   /* put together the new symbolic matrix */
224   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
225   PetscCall(MatSetBlockSizesFromMats(C,A,B));
226 
227   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
228   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
229   c          = (Mat_SeqAIJ*)(C->data);
230   c->free_a  = PETSC_FALSE;
231   c->free_ij = PETSC_TRUE;
232   c->nonew   = 0;
233 
234   /* fast, needs non-scalable O(bn) array 'abdense' */
235   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
236 
237   /* set MatInfo */
238   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
239   if (afill < 1.0) afill = 1.0;
240   C->info.mallocs           = ndouble;
241   C->info.fill_ratio_given  = fill;
242   C->info.fill_ratio_needed = afill;
243 
244 #if defined(PETSC_USE_INFO)
245   if (ci[am]) {
246     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
247     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
248   } else {
249     PetscCall(PetscInfo(C,"Empty matrix product\n"));
250   }
251 #endif
252   PetscFunctionReturn(0);
253 }
254 
255 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C)
256 {
257   PetscLogDouble    flops=0.0;
258   Mat_SeqAIJ        *a   = (Mat_SeqAIJ*)A->data;
259   Mat_SeqAIJ        *b   = (Mat_SeqAIJ*)B->data;
260   Mat_SeqAIJ        *c   = (Mat_SeqAIJ*)C->data;
261   PetscInt          *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
262   PetscInt          am   =A->rmap->n,cm=C->rmap->n;
263   PetscInt          i,j,k,anzi,bnzi,cnzi,brow;
264   PetscScalar       *ca,valtmp;
265   PetscScalar       *ab_dense;
266   PetscContainer    cab_dense;
267   const PetscScalar *aa,*ba,*baj;
268 
269   PetscFunctionBegin;
270   PetscCall(MatSeqAIJGetArrayRead(A,&aa));
271   PetscCall(MatSeqAIJGetArrayRead(B,&ba));
272   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
273     PetscCall(PetscMalloc1(ci[cm]+1,&ca));
274     c->a      = ca;
275     c->free_a = PETSC_TRUE;
276   } else ca = c->a;
277 
278   /* TODO this should be done in the symbolic phase */
279   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
280      that is hard to eradicate) */
281   PetscCall(PetscObjectQuery((PetscObject)C,"__PETSc__ab_dense",(PetscObject*)&cab_dense));
282   if (!cab_dense) {
283     PetscCall(PetscMalloc1(B->cmap->N,&ab_dense));
284     PetscCall(PetscContainerCreate(PETSC_COMM_SELF,&cab_dense));
285     PetscCall(PetscContainerSetPointer(cab_dense,ab_dense));
286     PetscCall(PetscContainerSetUserDestroy(cab_dense,PetscContainerUserDestroyDefault));
287     PetscCall(PetscObjectCompose((PetscObject)C,"__PETSc__ab_dense",(PetscObject)cab_dense));
288     PetscCall(PetscObjectDereference((PetscObject)cab_dense));
289   }
290   PetscCall(PetscContainerGetPointer(cab_dense,(void**)&ab_dense));
291   PetscCall(PetscArrayzero(ab_dense,B->cmap->N));
292 
293   /* clean old values in C */
294   PetscCall(PetscArrayzero(ca,ci[cm]));
295   /* Traverse A row-wise. */
296   /* Build the ith row in C by summing over nonzero columns in A, */
297   /* the rows of B corresponding to nonzeros of A. */
298   for (i=0; i<am; i++) {
299     anzi = ai[i+1] - ai[i];
300     for (j=0; j<anzi; j++) {
301       brow = aj[j];
302       bnzi = bi[brow+1] - bi[brow];
303       bjj  = bj + bi[brow];
304       baj  = ba + bi[brow];
305       /* perform dense axpy */
306       valtmp = aa[j];
307       for (k=0; k<bnzi; k++) {
308         ab_dense[bjj[k]] += valtmp*baj[k];
309       }
310       flops += 2*bnzi;
311     }
312     aj += anzi; aa += anzi;
313 
314     cnzi = ci[i+1] - ci[i];
315     for (k=0; k<cnzi; k++) {
316       ca[k]          += ab_dense[cj[k]];
317       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
318     }
319     flops += cnzi;
320     cj    += cnzi; ca += cnzi;
321   }
322 #if defined(PETSC_HAVE_DEVICE)
323   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
324 #endif
325   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
326   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
327   PetscCall(PetscLogFlops(flops));
328   PetscCall(MatSeqAIJRestoreArrayRead(A,&aa));
329   PetscCall(MatSeqAIJRestoreArrayRead(B,&ba));
330   PetscFunctionReturn(0);
331 }
332 
333 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
334 {
335   PetscLogDouble    flops=0.0;
336   Mat_SeqAIJ        *a   = (Mat_SeqAIJ*)A->data;
337   Mat_SeqAIJ        *b   = (Mat_SeqAIJ*)B->data;
338   Mat_SeqAIJ        *c   = (Mat_SeqAIJ*)C->data;
339   PetscInt          *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
340   PetscInt          am   = A->rmap->N,cm=C->rmap->N;
341   PetscInt          i,j,k,anzi,bnzi,cnzi,brow;
342   PetscScalar       *ca=c->a,valtmp;
343   const PetscScalar *aa,*ba,*baj;
344   PetscInt          nextb;
345 
346   PetscFunctionBegin;
347   PetscCall(MatSeqAIJGetArrayRead(A,&aa));
348   PetscCall(MatSeqAIJGetArrayRead(B,&ba));
349   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
350     PetscCall(PetscMalloc1(ci[cm]+1,&ca));
351     c->a      = ca;
352     c->free_a = PETSC_TRUE;
353   }
354 
355   /* clean old values in C */
356   PetscCall(PetscArrayzero(ca,ci[cm]));
357   /* Traverse A row-wise. */
358   /* Build the ith row in C by summing over nonzero columns in A, */
359   /* the rows of B corresponding to nonzeros of A. */
360   for (i=0; i<am; i++) {
361     anzi = ai[i+1] - ai[i];
362     cnzi = ci[i+1] - ci[i];
363     for (j=0; j<anzi; j++) {
364       brow = aj[j];
365       bnzi = bi[brow+1] - bi[brow];
366       bjj  = bj + bi[brow];
367       baj  = ba + bi[brow];
368       /* perform sparse axpy */
369       valtmp = aa[j];
370       nextb  = 0;
371       for (k=0; nextb<bnzi; k++) {
372         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
373           ca[k] += valtmp*baj[nextb++];
374         }
375       }
376       flops += 2*bnzi;
377     }
378     aj += anzi; aa += anzi;
379     cj += cnzi; ca += cnzi;
380   }
381 #if defined(PETSC_HAVE_DEVICE)
382   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
383 #endif
384   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
385   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
386   PetscCall(PetscLogFlops(flops));
387   PetscCall(MatSeqAIJRestoreArrayRead(A,&aa));
388   PetscCall(MatSeqAIJRestoreArrayRead(B,&ba));
389   PetscFunctionReturn(0);
390 }
391 
392 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C)
393 {
394   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
395   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
396   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
397   MatScalar          *ca;
398   PetscReal          afill;
399   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
400   PetscTable         ta;
401   PetscFreeSpaceList free_space=NULL,current_space=NULL;
402 
403   PetscFunctionBegin;
404   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
405   /*-----------------------------------------------------------------------------------------*/
406   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
407   PetscCall(PetscMalloc1(am+2,&ci));
408   ci[0] = 0;
409 
410   /* create and initialize a linked list */
411   PetscCall(PetscTableCreate(bn,bn,&ta));
412   MatRowMergeMax_SeqAIJ(b,bm,ta);
413   PetscCall(PetscTableGetCount(ta,&Crmax));
414   PetscCall(PetscTableDestroy(&ta));
415 
416   PetscCall(PetscLLCondensedCreate_fast(Crmax,&lnk));
417 
418   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
419   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
420   current_space = free_space;
421 
422   /* Determine ci and cj */
423   for (i=0; i<am; i++) {
424     anzi = ai[i+1] - ai[i];
425     aj   = a->j + ai[i];
426     for (j=0; j<anzi; j++) {
427       brow = aj[j];
428       bnzj = bi[brow+1] - bi[brow];
429       bj   = b->j + bi[brow];
430       /* add non-zero cols of B into the sorted linked list lnk */
431       PetscCall(PetscLLCondensedAddSorted_fast(bnzj,bj,lnk));
432     }
433     /* add possible missing diagonal entry */
434     if (C->force_diagonals) {
435       PetscCall(PetscLLCondensedAddSorted_fast(1,&i,lnk));
436     }
437     cnzi = lnk[1];
438 
439     /* If free space is not available, make more free space */
440     /* Double the amount of total space in the list */
441     if (current_space->local_remaining<cnzi) {
442       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space));
443       ndouble++;
444     }
445 
446     /* Copy data into free space, then initialize lnk */
447     PetscCall(PetscLLCondensedClean_fast(cnzi,current_space->array,lnk));
448 
449     current_space->array           += cnzi;
450     current_space->local_used      += cnzi;
451     current_space->local_remaining -= cnzi;
452 
453     ci[i+1] = ci[i] + cnzi;
454   }
455 
456   /* Column indices are in the list of free space */
457   /* Allocate space for cj, initialize cj, and */
458   /* destroy list of free space and other temporary array(s) */
459   PetscCall(PetscMalloc1(ci[am]+1,&cj));
460   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
461   PetscCall(PetscLLCondensedDestroy_fast(lnk));
462 
463   /* Allocate space for ca */
464   PetscCall(PetscCalloc1(ci[am]+1,&ca));
465 
466   /* put together the new symbolic matrix */
467   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C));
468   PetscCall(MatSetBlockSizesFromMats(C,A,B));
469 
470   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
471   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
472   c          = (Mat_SeqAIJ*)(C->data);
473   c->free_a  = PETSC_TRUE;
474   c->free_ij = PETSC_TRUE;
475   c->nonew   = 0;
476 
477   /* slower, less memory */
478   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
479 
480   /* set MatInfo */
481   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
482   if (afill < 1.0) afill = 1.0;
483   C->info.mallocs           = ndouble;
484   C->info.fill_ratio_given  = fill;
485   C->info.fill_ratio_needed = afill;
486 
487 #if defined(PETSC_USE_INFO)
488   if (ci[am]) {
489     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
490     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
491   } else {
492     PetscCall(PetscInfo(C,"Empty matrix product\n"));
493   }
494 #endif
495   PetscFunctionReturn(0);
496 }
497 
498 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C)
499 {
500   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
501   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
502   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
503   MatScalar          *ca;
504   PetscReal          afill;
505   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
506   PetscTable         ta;
507   PetscFreeSpaceList free_space=NULL,current_space=NULL;
508 
509   PetscFunctionBegin;
510   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
511   /*---------------------------------------------------------------------------------------------*/
512   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
513   PetscCall(PetscMalloc1(am+2,&ci));
514   ci[0] = 0;
515 
516   /* create and initialize a linked list */
517   PetscCall(PetscTableCreate(bn,bn,&ta));
518   MatRowMergeMax_SeqAIJ(b,bm,ta);
519   PetscCall(PetscTableGetCount(ta,&Crmax));
520   PetscCall(PetscTableDestroy(&ta));
521   PetscCall(PetscLLCondensedCreate_Scalable(Crmax,&lnk));
522 
523   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
524   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
525   current_space = free_space;
526 
527   /* Determine ci and cj */
528   for (i=0; i<am; i++) {
529     anzi = ai[i+1] - ai[i];
530     aj   = a->j + ai[i];
531     for (j=0; j<anzi; j++) {
532       brow = aj[j];
533       bnzj = bi[brow+1] - bi[brow];
534       bj   = b->j + bi[brow];
535       /* add non-zero cols of B into the sorted linked list lnk */
536       PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk));
537     }
538     /* add possible missing diagonal entry */
539     if (C->force_diagonals) {
540       PetscCall(PetscLLCondensedAddSorted_Scalable(1,&i,lnk));
541     }
542 
543     cnzi = lnk[0];
544 
545     /* If free space is not available, make more free space */
546     /* Double the amount of total space in the list */
547     if (current_space->local_remaining<cnzi) {
548       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space));
549       ndouble++;
550     }
551 
552     /* Copy data into free space, then initialize lnk */
553     PetscCall(PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk));
554 
555     current_space->array           += cnzi;
556     current_space->local_used      += cnzi;
557     current_space->local_remaining -= cnzi;
558 
559     ci[i+1] = ci[i] + cnzi;
560   }
561 
562   /* Column indices are in the list of free space */
563   /* Allocate space for cj, initialize cj, and */
564   /* destroy list of free space and other temporary array(s) */
565   PetscCall(PetscMalloc1(ci[am]+1,&cj));
566   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
567   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
568 
569   /* Allocate space for ca */
570   /*-----------------------*/
571   PetscCall(PetscCalloc1(ci[am]+1,&ca));
572 
573   /* put together the new symbolic matrix */
574   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C));
575   PetscCall(MatSetBlockSizesFromMats(C,A,B));
576 
577   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
578   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
579   c          = (Mat_SeqAIJ*)(C->data);
580   c->free_a  = PETSC_TRUE;
581   c->free_ij = PETSC_TRUE;
582   c->nonew   = 0;
583 
584   /* slower, less memory */
585   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
586 
587   /* set MatInfo */
588   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
589   if (afill < 1.0) afill = 1.0;
590   C->info.mallocs           = ndouble;
591   C->info.fill_ratio_given  = fill;
592   C->info.fill_ratio_needed = afill;
593 
594 #if defined(PETSC_USE_INFO)
595   if (ci[am]) {
596     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
597     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
598   } else {
599     PetscCall(PetscInfo(C,"Empty matrix product\n"));
600   }
601 #endif
602   PetscFunctionReturn(0);
603 }
604 
605 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C)
606 {
607   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
608   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
609   PetscInt           *ci,*cj,*bb;
610   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
611   PetscReal          afill;
612   PetscInt           i,j,col,ndouble = 0;
613   PetscFreeSpaceList free_space=NULL,current_space=NULL;
614   PetscHeap          h;
615 
616   PetscFunctionBegin;
617   /* Get ci and cj - by merging sorted rows using a heap */
618   /*---------------------------------------------------------------------------------------------*/
619   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
620   PetscCall(PetscMalloc1(am+2,&ci));
621   ci[0] = 0;
622 
623   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
624   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
625   current_space = free_space;
626 
627   PetscCall(PetscHeapCreate(a->rmax,&h));
628   PetscCall(PetscMalloc1(a->rmax,&bb));
629 
630   /* Determine ci and cj */
631   for (i=0; i<am; i++) {
632     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
633     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
634     ci[i+1] = ci[i];
635     /* Populate the min heap */
636     for (j=0; j<anzi; j++) {
637       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
638       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
639         PetscCall(PetscHeapAdd(h,j,bj[bb[j]++]));
640       }
641     }
642     /* Pick off the min element, adding it to free space */
643     PetscCall(PetscHeapPop(h,&j,&col));
644     while (j >= 0) {
645       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
646         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space));
647         ndouble++;
648       }
649       *(current_space->array++) = col;
650       current_space->local_used++;
651       current_space->local_remaining--;
652       ci[i+1]++;
653 
654       /* stash if anything else remains in this row of B */
655       if (bb[j] < bi[acol[j]+1]) PetscCall(PetscHeapStash(h,j,bj[bb[j]++]));
656       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
657         PetscInt j2,col2;
658         PetscCall(PetscHeapPeek(h,&j2,&col2));
659         if (col2 != col) break;
660         PetscCall(PetscHeapPop(h,&j2,&col2));
661         if (bb[j2] < bi[acol[j2]+1]) PetscCall(PetscHeapStash(h,j2,bj[bb[j2]++]));
662       }
663       /* Put any stashed elements back into the min heap */
664       PetscCall(PetscHeapUnstash(h));
665       PetscCall(PetscHeapPop(h,&j,&col));
666     }
667   }
668   PetscCall(PetscFree(bb));
669   PetscCall(PetscHeapDestroy(&h));
670 
671   /* Column indices are in the list of free space */
672   /* Allocate space for cj, initialize cj, and */
673   /* destroy list of free space and other temporary array(s) */
674   PetscCall(PetscMalloc1(ci[am],&cj));
675   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
676 
677   /* put together the new symbolic matrix */
678   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
679   PetscCall(MatSetBlockSizesFromMats(C,A,B));
680 
681   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
682   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
683   c          = (Mat_SeqAIJ*)(C->data);
684   c->free_a  = PETSC_TRUE;
685   c->free_ij = PETSC_TRUE;
686   c->nonew   = 0;
687 
688   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
689 
690   /* set MatInfo */
691   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
692   if (afill < 1.0) afill = 1.0;
693   C->info.mallocs           = ndouble;
694   C->info.fill_ratio_given  = fill;
695   C->info.fill_ratio_needed = afill;
696 
697 #if defined(PETSC_USE_INFO)
698   if (ci[am]) {
699     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
700     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
701   } else {
702     PetscCall(PetscInfo(C,"Empty matrix product\n"));
703   }
704 #endif
705   PetscFunctionReturn(0);
706 }
707 
708 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C)
709 {
710   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
711   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
712   PetscInt           *ci,*cj,*bb;
713   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
714   PetscReal          afill;
715   PetscInt           i,j,col,ndouble = 0;
716   PetscFreeSpaceList free_space=NULL,current_space=NULL;
717   PetscHeap          h;
718   PetscBT            bt;
719 
720   PetscFunctionBegin;
721   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
722   /*---------------------------------------------------------------------------------------------*/
723   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
724   PetscCall(PetscMalloc1(am+2,&ci));
725   ci[0] = 0;
726 
727   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
728   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
729 
730   current_space = free_space;
731 
732   PetscCall(PetscHeapCreate(a->rmax,&h));
733   PetscCall(PetscMalloc1(a->rmax,&bb));
734   PetscCall(PetscBTCreate(bn,&bt));
735 
736   /* Determine ci and cj */
737   for (i=0; i<am; i++) {
738     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
739     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
740     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
741     ci[i+1] = ci[i];
742     /* Populate the min heap */
743     for (j=0; j<anzi; j++) {
744       PetscInt brow = acol[j];
745       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
746         PetscInt bcol = bj[bb[j]];
747         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
748           PetscCall(PetscHeapAdd(h,j,bcol));
749           bb[j]++;
750           break;
751         }
752       }
753     }
754     /* Pick off the min element, adding it to free space */
755     PetscCall(PetscHeapPop(h,&j,&col));
756     while (j >= 0) {
757       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
758         fptr = NULL;                      /* need PetscBTMemzero */
759         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space));
760         ndouble++;
761       }
762       *(current_space->array++) = col;
763       current_space->local_used++;
764       current_space->local_remaining--;
765       ci[i+1]++;
766 
767       /* stash if anything else remains in this row of B */
768       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
769         PetscInt bcol = bj[bb[j]];
770         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
771           PetscCall(PetscHeapAdd(h,j,bcol));
772           bb[j]++;
773           break;
774         }
775       }
776       PetscCall(PetscHeapPop(h,&j,&col));
777     }
778     if (fptr) {                 /* Clear the bits for this row */
779       for (; fptr<current_space->array; fptr++) PetscCall(PetscBTClear(bt,*fptr));
780     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
781       PetscCall(PetscBTMemzero(bn,bt));
782     }
783   }
784   PetscCall(PetscFree(bb));
785   PetscCall(PetscHeapDestroy(&h));
786   PetscCall(PetscBTDestroy(&bt));
787 
788   /* Column indices are in the list of free space */
789   /* Allocate space for cj, initialize cj, and */
790   /* destroy list of free space and other temporary array(s) */
791   PetscCall(PetscMalloc1(ci[am],&cj));
792   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
793 
794   /* put together the new symbolic matrix */
795   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
796   PetscCall(MatSetBlockSizesFromMats(C,A,B));
797 
798   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
799   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
800   c          = (Mat_SeqAIJ*)(C->data);
801   c->free_a  = PETSC_TRUE;
802   c->free_ij = PETSC_TRUE;
803   c->nonew   = 0;
804 
805   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
806 
807   /* set MatInfo */
808   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
809   if (afill < 1.0) afill = 1.0;
810   C->info.mallocs           = ndouble;
811   C->info.fill_ratio_given  = fill;
812   C->info.fill_ratio_needed = afill;
813 
814 #if defined(PETSC_USE_INFO)
815   if (ci[am]) {
816     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
817     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
818   } else {
819     PetscCall(PetscInfo(C,"Empty matrix product\n"));
820   }
821 #endif
822   PetscFunctionReturn(0);
823 }
824 
825 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C)
826 {
827   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
828   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1;
829   PetscInt           *ci,*cj,*outputj,worki_L1[9],worki_L2[9];
830   PetscInt           c_maxmem,a_maxrownnz=0,a_rownnz;
831   const PetscInt     workcol[8]={0,1,2,3,4,5,6,7};
832   const PetscInt     am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
833   const PetscInt     *brow_ptr[8],*brow_end[8];
834   PetscInt           window[8];
835   PetscInt           window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows;
836   PetscInt           i,k,ndouble=0,L1_rowsleft,rowsleft;
837   PetscReal          afill;
838   PetscInt           *workj_L1,*workj_L2,*workj_L3;
839   PetscInt           L1_nnz,L2_nnz;
840 
841   /* Step 1: Get upper bound on memory required for allocation.
842              Because of the way virtual memory works,
843              only the memory pages that are actually needed will be physically allocated. */
844   PetscFunctionBegin;
845   PetscCall(PetscMalloc1(am+1,&ci));
846   for (i=0; i<am; i++) {
847     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
848     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
849     a_rownnz = 0;
850     for (k=0; k<anzi; ++k) {
851       a_rownnz += bi[acol[k]+1] - bi[acol[k]];
852       if (a_rownnz > bn) {
853         a_rownnz = bn;
854         break;
855       }
856     }
857     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
858   }
859   /* temporary work areas for merging rows */
860   PetscCall(PetscMalloc1(a_maxrownnz*8,&workj_L1));
861   PetscCall(PetscMalloc1(a_maxrownnz*8,&workj_L2));
862   PetscCall(PetscMalloc1(a_maxrownnz,&workj_L3));
863 
864   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
865   c_maxmem = 8*(ai[am]+bi[bm]);
866   /* Step 2: Populate pattern for C */
867   PetscCall(PetscMalloc1(c_maxmem,&cj));
868 
869   ci_nnz       = 0;
870   ci[0]        = 0;
871   worki_L1[0]  = 0;
872   worki_L2[0]  = 0;
873   for (i=0; i<am; i++) {
874     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
875     const PetscInt *acol = aj + ai[i];      /* column indices of nonzero entries in this row */
876     rowsleft             = anzi;
877     inputcol_L1          = acol;
878     L2_nnz               = 0;
879     L2_nrows             = 1;  /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
880     worki_L2[1]          = 0;
881     outputi_nnz          = 0;
882 
883     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
884     while (ci_nnz+a_maxrownnz > c_maxmem) {
885       c_maxmem *= 2;
886       ndouble++;
887       PetscCall(PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj));
888     }
889 
890     while (rowsleft) {
891       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
892       L1_nrows    = 0;
893       L1_nnz      = 0;
894       inputcol    = inputcol_L1;
895       inputi      = bi;
896       inputj      = bj;
897 
898       /* The following macro is used to specialize for small rows in A.
899          This helps with compiler unrolling, improving performance substantially.
900           Input:  inputj   inputi  inputcol  bn
901           Output: outputj  outputi_nnz                       */
902        #define MatMatMultSymbolic_RowMergeMacro(ANNZ)                        \
903          window_min  = bn;                                                   \
904          outputi_nnz = 0;                                                    \
905          for (k=0; k<ANNZ; ++k) {                                            \
906            brow_ptr[k] = inputj + inputi[inputcol[k]];                       \
907            brow_end[k] = inputj + inputi[inputcol[k]+1];                     \
908            window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn;   \
909            window_min  = PetscMin(window[k], window_min);                    \
910          }                                                                   \
911          while (window_min < bn) {                                           \
912            outputj[outputi_nnz++] = window_min;                              \
913            /* advance front and compute new minimum */                       \
914            old_window_min = window_min;                                      \
915            window_min = bn;                                                  \
916            for (k=0; k<ANNZ; ++k) {                                          \
917              if (window[k] == old_window_min) {                              \
918                brow_ptr[k]++;                                                \
919                window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
920              }                                                               \
921              window_min = PetscMin(window[k], window_min);                   \
922            }                                                                 \
923          }
924 
925       /************** L E V E L  1 ***************/
926       /* Merge up to 8 rows of B to L1 work array*/
927       while (L1_rowsleft) {
928         outputi_nnz = 0;
929         if (anzi > 8)  outputj = workj_L1 + L1_nnz;     /* Level 1 rowmerge*/
930         else           outputj = cj + ci_nnz;           /* Merge directly to C */
931 
932         switch (L1_rowsleft) {
933         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
934                  brow_end[0] = inputj + inputi[inputcol[0]+1];
935                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
936                  inputcol    += L1_rowsleft;
937                  rowsleft    -= L1_rowsleft;
938                  L1_rowsleft  = 0;
939                  break;
940         case 2:  MatMatMultSymbolic_RowMergeMacro(2);
941                  inputcol    += L1_rowsleft;
942                  rowsleft    -= L1_rowsleft;
943                  L1_rowsleft  = 0;
944                  break;
945         case 3: MatMatMultSymbolic_RowMergeMacro(3);
946                  inputcol    += L1_rowsleft;
947                  rowsleft    -= L1_rowsleft;
948                  L1_rowsleft  = 0;
949                  break;
950         case 4:  MatMatMultSymbolic_RowMergeMacro(4);
951                  inputcol    += L1_rowsleft;
952                  rowsleft    -= L1_rowsleft;
953                  L1_rowsleft  = 0;
954                  break;
955         case 5:  MatMatMultSymbolic_RowMergeMacro(5);
956                  inputcol    += L1_rowsleft;
957                  rowsleft    -= L1_rowsleft;
958                  L1_rowsleft  = 0;
959                  break;
960         case 6:  MatMatMultSymbolic_RowMergeMacro(6);
961                  inputcol    += L1_rowsleft;
962                  rowsleft    -= L1_rowsleft;
963                  L1_rowsleft  = 0;
964                  break;
965         case 7:  MatMatMultSymbolic_RowMergeMacro(7);
966                  inputcol    += L1_rowsleft;
967                  rowsleft    -= L1_rowsleft;
968                  L1_rowsleft  = 0;
969                  break;
970         default: MatMatMultSymbolic_RowMergeMacro(8);
971                  inputcol    += 8;
972                  rowsleft    -= 8;
973                  L1_rowsleft -= 8;
974                  break;
975         }
976         inputcol_L1           = inputcol;
977         L1_nnz               += outputi_nnz;
978         worki_L1[++L1_nrows]  = L1_nnz;
979       }
980 
981       /********************** L E V E L  2 ************************/
982       /* Merge from L1 work array to either C or to L2 work array */
983       if (anzi > 8) {
984         inputi      = worki_L1;
985         inputj      = workj_L1;
986         inputcol    = workcol;
987         outputi_nnz = 0;
988 
989         if (anzi <= 64) outputj = cj + ci_nnz;        /* Merge from L1 work array to C */
990         else            outputj = workj_L2 + L2_nnz;  /* Merge from L1 work array to L2 work array */
991 
992         switch (L1_nrows) {
993         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
994                  brow_end[0] = inputj + inputi[inputcol[0]+1];
995                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
996                  break;
997         case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
998         case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
999         case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1000         case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1001         case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1002         case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1003         case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1004         default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
1005         }
1006         L2_nnz               += outputi_nnz;
1007         worki_L2[++L2_nrows]  = L2_nnz;
1008 
1009         /************************ L E V E L  3 **********************/
1010         /* Merge from L2 work array to either C or to L2 work array */
1011         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1012           inputi      = worki_L2;
1013           inputj      = workj_L2;
1014           inputcol    = workcol;
1015           outputi_nnz = 0;
1016           if (rowsleft) outputj = workj_L3;
1017           else          outputj = cj + ci_nnz;
1018           switch (L2_nrows) {
1019           case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
1020                    brow_end[0] = inputj + inputi[inputcol[0]+1];
1021                    for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1022                    break;
1023           case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
1024           case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
1025           case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1026           case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1027           case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1028           case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1029           case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1030           default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1031           }
1032           L2_nrows    = 1;
1033           L2_nnz      = outputi_nnz;
1034           worki_L2[1] = outputi_nnz;
1035           /* Copy to workj_L2 */
1036           if (rowsleft) {
1037             for (k=0; k<outputi_nnz; ++k)  workj_L2[k] = outputj[k];
1038           }
1039         }
1040       }
1041     }  /* while (rowsleft) */
1042 #undef MatMatMultSymbolic_RowMergeMacro
1043 
1044     /* terminate current row */
1045     ci_nnz += outputi_nnz;
1046     ci[i+1] = ci_nnz;
1047   }
1048 
1049   /* Step 3: Create the new symbolic matrix */
1050   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
1051   PetscCall(MatSetBlockSizesFromMats(C,A,B));
1052 
1053   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1054   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1055   c          = (Mat_SeqAIJ*)(C->data);
1056   c->free_a  = PETSC_TRUE;
1057   c->free_ij = PETSC_TRUE;
1058   c->nonew   = 0;
1059 
1060   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1061 
1062   /* set MatInfo */
1063   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1064   if (afill < 1.0) afill = 1.0;
1065   C->info.mallocs           = ndouble;
1066   C->info.fill_ratio_given  = fill;
1067   C->info.fill_ratio_needed = afill;
1068 
1069 #if defined(PETSC_USE_INFO)
1070   if (ci[am]) {
1071     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
1072     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
1073   } else {
1074     PetscCall(PetscInfo(C,"Empty matrix product\n"));
1075   }
1076 #endif
1077 
1078   /* Step 4: Free temporary work areas */
1079   PetscCall(PetscFree(workj_L1));
1080   PetscCall(PetscFree(workj_L2));
1081   PetscCall(PetscFree(workj_L3));
1082   PetscFunctionReturn(0);
1083 }
1084 
1085 /* concatenate unique entries and then sort */
1086 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C)
1087 {
1088   Mat_SeqAIJ     *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1089   const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
1090   PetscInt       *ci,*cj,bcol;
1091   PetscInt       am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1092   PetscReal      afill;
1093   PetscInt       i,j,ndouble = 0;
1094   PetscSegBuffer seg,segrow;
1095   char           *seen;
1096 
1097   PetscFunctionBegin;
1098   PetscCall(PetscMalloc1(am+1,&ci));
1099   ci[0] = 0;
1100 
1101   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1102   PetscCall(PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg));
1103   PetscCall(PetscSegBufferCreate(sizeof(PetscInt),100,&segrow));
1104   PetscCall(PetscCalloc1(bn,&seen));
1105 
1106   /* Determine ci and cj */
1107   for (i=0; i<am; i++) {
1108     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1109     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1110     PetscInt packlen = 0,*PETSC_RESTRICT crow;
1111 
1112     /* Pack segrow */
1113     for (j=0; j<anzi; j++) {
1114       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
1115       for (k=bjstart; k<bjend; k++) {
1116         bcol = bj[k];
1117         if (!seen[bcol]) { /* new entry */
1118           PetscInt *PETSC_RESTRICT slot;
1119           PetscCall(PetscSegBufferGetInts(segrow,1,&slot));
1120           *slot = bcol;
1121           seen[bcol] = 1;
1122           packlen++;
1123         }
1124       }
1125     }
1126 
1127     /* Check i-th diagonal entry */
1128     if (C->force_diagonals && !seen[i]) {
1129       PetscInt *PETSC_RESTRICT slot;
1130       PetscCall(PetscSegBufferGetInts(segrow,1,&slot));
1131       *slot   = i;
1132       seen[i] = 1;
1133       packlen++;
1134     }
1135 
1136     PetscCall(PetscSegBufferGetInts(seg,packlen,&crow));
1137     PetscCall(PetscSegBufferExtractTo(segrow,crow));
1138     PetscCall(PetscSortInt(packlen,crow));
1139     ci[i+1] = ci[i] + packlen;
1140     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
1141   }
1142   PetscCall(PetscSegBufferDestroy(&segrow));
1143   PetscCall(PetscFree(seen));
1144 
1145   /* Column indices are in the segmented buffer */
1146   PetscCall(PetscSegBufferExtractAlloc(seg,&cj));
1147   PetscCall(PetscSegBufferDestroy(&seg));
1148 
1149   /* put together the new symbolic matrix */
1150   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
1151   PetscCall(MatSetBlockSizesFromMats(C,A,B));
1152 
1153   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1154   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1155   c          = (Mat_SeqAIJ*)(C->data);
1156   c->free_a  = PETSC_TRUE;
1157   c->free_ij = PETSC_TRUE;
1158   c->nonew   = 0;
1159 
1160   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1161 
1162   /* set MatInfo */
1163   afill = (PetscReal)ci[am]/PetscMax(ai[am]+bi[bm],1) + 1.e-5;
1164   if (afill < 1.0) afill = 1.0;
1165   C->info.mallocs           = ndouble;
1166   C->info.fill_ratio_given  = fill;
1167   C->info.fill_ratio_needed = afill;
1168 
1169 #if defined(PETSC_USE_INFO)
1170   if (ci[am]) {
1171     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
1172     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
1173   } else {
1174     PetscCall(PetscInfo(C,"Empty matrix product\n"));
1175   }
1176 #endif
1177   PetscFunctionReturn(0);
1178 }
1179 
1180 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
1181 {
1182   Mat_MatMatTransMult *abt=(Mat_MatMatTransMult *)data;
1183 
1184   PetscFunctionBegin;
1185   PetscCall(MatTransposeColoringDestroy(&abt->matcoloring));
1186   PetscCall(MatDestroy(&abt->Bt_den));
1187   PetscCall(MatDestroy(&abt->ABt_den));
1188   PetscCall(PetscFree(abt));
1189   PetscFunctionReturn(0);
1190 }
1191 
1192 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1193 {
1194   Mat                 Bt;
1195   Mat_MatMatTransMult *abt;
1196   Mat_Product         *product = C->product;
1197   char                *alg;
1198 
1199   PetscFunctionBegin;
1200   PetscCheck(product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1201   PetscCheck(!product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
1202 
1203   /* create symbolic Bt */
1204   PetscCall(MatTransposeSymbolic(B,&Bt));
1205   PetscCall(MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs)));
1206   PetscCall(MatSetType(Bt,((PetscObject)A)->type_name));
1207 
1208   /* get symbolic C=A*Bt */
1209   PetscCall(PetscStrallocpy(product->alg,&alg));
1210   PetscCall(MatProductSetAlgorithm(C,"sorted")); /* set algorithm for C = A*Bt */
1211   PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C));
1212   PetscCall(MatProductSetAlgorithm(C,alg)); /* resume original algorithm for ABt product */
1213   PetscCall(PetscFree(alg));
1214 
1215   /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
1216   PetscCall(PetscNew(&abt));
1217 
1218   product->data    = abt;
1219   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
1220 
1221   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
1222 
1223   abt->usecoloring = PETSC_FALSE;
1224   PetscCall(PetscStrcmp(product->alg,"color",&abt->usecoloring));
1225   if (abt->usecoloring) {
1226     /* Create MatTransposeColoring from symbolic C=A*B^T */
1227     MatTransposeColoring matcoloring;
1228     MatColoring          coloring;
1229     ISColoring           iscoloring;
1230     Mat                  Bt_dense,C_dense;
1231 
1232     /* inode causes memory problem */
1233     PetscCall(MatSetOption(C,MAT_USE_INODES,PETSC_FALSE));
1234 
1235     PetscCall(MatColoringCreate(C,&coloring));
1236     PetscCall(MatColoringSetDistance(coloring,2));
1237     PetscCall(MatColoringSetType(coloring,MATCOLORINGSL));
1238     PetscCall(MatColoringSetFromOptions(coloring));
1239     PetscCall(MatColoringApply(coloring,&iscoloring));
1240     PetscCall(MatColoringDestroy(&coloring));
1241     PetscCall(MatTransposeColoringCreate(C,iscoloring,&matcoloring));
1242 
1243     abt->matcoloring = matcoloring;
1244 
1245     PetscCall(ISColoringDestroy(&iscoloring));
1246 
1247     /* Create Bt_dense and C_dense = A*Bt_dense */
1248     PetscCall(MatCreate(PETSC_COMM_SELF,&Bt_dense));
1249     PetscCall(MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors));
1250     PetscCall(MatSetType(Bt_dense,MATSEQDENSE));
1251     PetscCall(MatSeqDenseSetPreallocation(Bt_dense,NULL));
1252 
1253     Bt_dense->assembled = PETSC_TRUE;
1254     abt->Bt_den         = Bt_dense;
1255 
1256     PetscCall(MatCreate(PETSC_COMM_SELF,&C_dense));
1257     PetscCall(MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors));
1258     PetscCall(MatSetType(C_dense,MATSEQDENSE));
1259     PetscCall(MatSeqDenseSetPreallocation(C_dense,NULL));
1260 
1261     Bt_dense->assembled = PETSC_TRUE;
1262     abt->ABt_den  = C_dense;
1263 
1264 #if defined(PETSC_USE_INFO)
1265     {
1266       Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
1267       PetscCall(PetscInfo(C,"Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(double)(((PetscReal)(c->nz))/((PetscReal)(A->rmap->n*matcoloring->ncolors)))));
1268     }
1269 #endif
1270   }
1271   /* clean up */
1272   PetscCall(MatDestroy(&Bt));
1273   PetscFunctionReturn(0);
1274 }
1275 
1276 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1277 {
1278   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1279   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
1280   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
1281   PetscLogDouble      flops=0.0;
1282   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
1283   Mat_MatMatTransMult *abt;
1284   Mat_Product         *product = C->product;
1285 
1286   PetscFunctionBegin;
1287   PetscCheck(product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1288   abt = (Mat_MatMatTransMult *)product->data;
1289   PetscCheck(abt,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1290   /* clear old values in C */
1291   if (!c->a) {
1292     PetscCall(PetscCalloc1(ci[cm]+1,&ca));
1293     c->a      = ca;
1294     c->free_a = PETSC_TRUE;
1295   } else {
1296     ca =  c->a;
1297     PetscCall(PetscArrayzero(ca,ci[cm]+1));
1298   }
1299 
1300   if (abt->usecoloring) {
1301     MatTransposeColoring matcoloring = abt->matcoloring;
1302     Mat                  Bt_dense,C_dense = abt->ABt_den;
1303 
1304     /* Get Bt_dense by Apply MatTransposeColoring to B */
1305     Bt_dense = abt->Bt_den;
1306     PetscCall(MatTransColoringApplySpToDen(matcoloring,B,Bt_dense));
1307 
1308     /* C_dense = A*Bt_dense */
1309     PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense));
1310 
1311     /* Recover C from C_dense */
1312     PetscCall(MatTransColoringApplyDenToSp(matcoloring,C_dense,C));
1313     PetscFunctionReturn(0);
1314   }
1315 
1316   for (i=0; i<cm; i++) {
1317     anzi = ai[i+1] - ai[i];
1318     acol = aj + ai[i];
1319     aval = aa + ai[i];
1320     cnzi = ci[i+1] - ci[i];
1321     ccol = cj + ci[i];
1322     cval = ca + ci[i];
1323     for (j=0; j<cnzi; j++) {
1324       brow = ccol[j];
1325       bnzj = bi[brow+1] - bi[brow];
1326       bcol = bj + bi[brow];
1327       bval = ba + bi[brow];
1328 
1329       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
1330       nexta = 0; nextb = 0;
1331       while (nexta<anzi && nextb<bnzj) {
1332         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
1333         if (nexta == anzi) break;
1334         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
1335         if (nextb == bnzj) break;
1336         if (acol[nexta] == bcol[nextb]) {
1337           cval[j] += aval[nexta]*bval[nextb];
1338           nexta++; nextb++;
1339           flops += 2;
1340         }
1341       }
1342     }
1343   }
1344   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
1345   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
1346   PetscCall(PetscLogFlops(flops));
1347   PetscFunctionReturn(0);
1348 }
1349 
1350 PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
1351 {
1352   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult*)data;
1353 
1354   PetscFunctionBegin;
1355   PetscCall(MatDestroy(&atb->At));
1356   if (atb->destroy) PetscCall((*atb->destroy)(atb->data));
1357   PetscCall(PetscFree(atb));
1358   PetscFunctionReturn(0);
1359 }
1360 
1361 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1362 {
1363   Mat            At = NULL;
1364   Mat_Product    *product = C->product;
1365   PetscBool      flg,def,square;
1366 
1367   PetscFunctionBegin;
1368   MatCheckProduct(C,4);
1369   square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE);
1370   /* outerproduct */
1371   PetscCall(PetscStrcmp(product->alg,"outerproduct",&flg));
1372   if (flg) {
1373     /* create symbolic At */
1374     if (!square) {
1375       PetscCall(MatTransposeSymbolic(A,&At));
1376       PetscCall(MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs)));
1377       PetscCall(MatSetType(At,((PetscObject)A)->type_name));
1378     }
1379     /* get symbolic C=At*B */
1380     PetscCall(MatProductSetAlgorithm(C,"sorted"));
1381     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C));
1382 
1383     /* clean up */
1384     if (!square) {
1385       PetscCall(MatDestroy(&At));
1386     }
1387 
1388     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
1389     PetscCall(MatProductSetAlgorithm(C,"outerproduct"));
1390     PetscFunctionReturn(0);
1391   }
1392 
1393   /* matmatmult */
1394   PetscCall(PetscStrcmp(product->alg,"default",&def));
1395   PetscCall(PetscStrcmp(product->alg,"at*b",&flg));
1396   if (flg || def) {
1397     Mat_MatTransMatMult *atb;
1398 
1399     PetscCheck(!product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
1400     PetscCall(PetscNew(&atb));
1401     if (!square) {
1402       PetscCall(MatTranspose(A,MAT_INITIAL_MATRIX,&At));
1403     }
1404     PetscCall(MatProductSetAlgorithm(C,"sorted"));
1405     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C));
1406     PetscCall(MatProductSetAlgorithm(C,"at*b"));
1407     product->data    = atb;
1408     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
1409     atb->At          = At;
1410 
1411     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
1412     PetscFunctionReturn(0);
1413   }
1414 
1415   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1416 }
1417 
1418 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1419 {
1420   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1421   PetscInt       am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1422   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1423   PetscLogDouble flops=0.0;
1424   MatScalar      *aa=a->a,*ba,*ca,*caj;
1425 
1426   PetscFunctionBegin;
1427   if (!c->a) {
1428     PetscCall(PetscCalloc1(ci[cm]+1,&ca));
1429 
1430     c->a      = ca;
1431     c->free_a = PETSC_TRUE;
1432   } else {
1433     ca   = c->a;
1434     PetscCall(PetscArrayzero(ca,ci[cm]));
1435   }
1436 
1437   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1438   for (i=0; i<am; i++) {
1439     bj   = b->j + bi[i];
1440     ba   = b->a + bi[i];
1441     bnzi = bi[i+1] - bi[i];
1442     anzi = ai[i+1] - ai[i];
1443     for (j=0; j<anzi; j++) {
1444       nextb = 0;
1445       crow  = *aj++;
1446       cjj   = cj + ci[crow];
1447       caj   = ca + ci[crow];
1448       /* perform sparse axpy operation.  Note cjj includes bj. */
1449       for (k=0; nextb<bnzi; k++) {
1450         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
1451           caj[k] += (*aa)*(*(ba+nextb));
1452           nextb++;
1453         }
1454       }
1455       flops += 2*bnzi;
1456       aa++;
1457     }
1458   }
1459 
1460   /* Assemble the final matrix and clean up */
1461   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
1462   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
1463   PetscCall(PetscLogFlops(flops));
1464   PetscFunctionReturn(0);
1465 }
1466 
1467 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
1468 {
1469   PetscFunctionBegin;
1470   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C));
1471   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1472   PetscFunctionReturn(0);
1473 }
1474 
1475 PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C,const PetscBool add)
1476 {
1477   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
1478   PetscScalar       *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4;
1479   const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av;
1480   const PetscInt    *aj;
1481   PetscInt          cm=C->rmap->n,cn=B->cmap->n,bm,am=A->rmap->n;
1482   PetscInt          clda;
1483   PetscInt          am4,bm4,col,i,j,n;
1484 
1485   PetscFunctionBegin;
1486   if (!cm || !cn) PetscFunctionReturn(0);
1487   PetscCall(MatSeqAIJGetArrayRead(A,&av));
1488   if (add) {
1489     PetscCall(MatDenseGetArray(C,&c));
1490   } else {
1491     PetscCall(MatDenseGetArrayWrite(C,&c));
1492   }
1493   PetscCall(MatDenseGetArrayRead(B,&b));
1494   PetscCall(MatDenseGetLDA(B,&bm));
1495   PetscCall(MatDenseGetLDA(C,&clda));
1496   am4 = 4*clda;
1497   bm4 = 4*bm;
1498   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1499   c1 = c; c2 = c1 + clda; c3 = c2 + clda; c4 = c3 + clda;
1500   for (col=0; col<(cn/4)*4; col += 4) {  /* over columns of C */
1501     for (i=0; i<am; i++) {        /* over rows of A in those columns */
1502       r1 = r2 = r3 = r4 = 0.0;
1503       n  = a->i[i+1] - a->i[i];
1504       aj = a->j + a->i[i];
1505       aa = av + a->i[i];
1506       for (j=0; j<n; j++) {
1507         const PetscScalar aatmp = aa[j];
1508         const PetscInt    ajtmp = aj[j];
1509         r1 += aatmp*b1[ajtmp];
1510         r2 += aatmp*b2[ajtmp];
1511         r3 += aatmp*b3[ajtmp];
1512         r4 += aatmp*b4[ajtmp];
1513       }
1514       if (add) {
1515         c1[i] += r1;
1516         c2[i] += r2;
1517         c3[i] += r3;
1518         c4[i] += r4;
1519       } else {
1520         c1[i] = r1;
1521         c2[i] = r2;
1522         c3[i] = r3;
1523         c4[i] = r4;
1524       }
1525     }
1526     b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1527     c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1528   }
1529   /* process remaining columns */
1530   if (col != cn) {
1531     PetscInt rc = cn-col;
1532 
1533     if (rc == 1) {
1534       for (i=0; i<am; i++) {
1535         r1 = 0.0;
1536         n  = a->i[i+1] - a->i[i];
1537         aj = a->j + a->i[i];
1538         aa = av + a->i[i];
1539         for (j=0; j<n; j++) r1 += aa[j]*b1[aj[j]];
1540         if (add) c1[i] += r1;
1541         else c1[i] = r1;
1542       }
1543     } else if (rc == 2) {
1544       for (i=0; i<am; i++) {
1545         r1 = r2 = 0.0;
1546         n  = a->i[i+1] - a->i[i];
1547         aj = a->j + a->i[i];
1548         aa = av + a->i[i];
1549         for (j=0; j<n; j++) {
1550           const PetscScalar aatmp = aa[j];
1551           const PetscInt    ajtmp = aj[j];
1552           r1 += aatmp*b1[ajtmp];
1553           r2 += aatmp*b2[ajtmp];
1554         }
1555         if (add) {
1556           c1[i] += r1;
1557           c2[i] += r2;
1558         } else {
1559           c1[i] = r1;
1560           c2[i] = r2;
1561         }
1562       }
1563     } else {
1564       for (i=0; i<am; i++) {
1565         r1 = r2 = r3 = 0.0;
1566         n  = a->i[i+1] - a->i[i];
1567         aj = a->j + a->i[i];
1568         aa = av + a->i[i];
1569         for (j=0; j<n; j++) {
1570           const PetscScalar aatmp = aa[j];
1571           const PetscInt    ajtmp = aj[j];
1572           r1 += aatmp*b1[ajtmp];
1573           r2 += aatmp*b2[ajtmp];
1574           r3 += aatmp*b3[ajtmp];
1575         }
1576         if (add) {
1577           c1[i] += r1;
1578           c2[i] += r2;
1579           c3[i] += r3;
1580         } else {
1581           c1[i] = r1;
1582           c2[i] = r2;
1583           c3[i] = r3;
1584         }
1585       }
1586     }
1587   }
1588   PetscCall(PetscLogFlops(cn*(2.0*a->nz)));
1589   if (add) {
1590     PetscCall(MatDenseRestoreArray(C,&c));
1591   } else {
1592     PetscCall(MatDenseRestoreArrayWrite(C,&c));
1593   }
1594   PetscCall(MatDenseRestoreArrayRead(B,&b));
1595   PetscCall(MatSeqAIJRestoreArrayRead(A,&av));
1596   PetscFunctionReturn(0);
1597 }
1598 
1599 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1600 {
1601   PetscFunctionBegin;
1602   PetscCheck(B->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n);
1603   PetscCheck(A->rmap->n == C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n);
1604   PetscCheck(B->cmap->n == C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n);
1605 
1606   PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C,PETSC_FALSE));
1607   PetscFunctionReturn(0);
1608 }
1609 
1610 /* ------------------------------------------------------- */
1611 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1612 {
1613   PetscFunctionBegin;
1614   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
1615   C->ops->productsymbolic = MatProductSymbolic_AB;
1616   PetscFunctionReturn(0);
1617 }
1618 
1619 PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat,Mat,PetscReal,Mat);
1620 
1621 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1622 {
1623   PetscFunctionBegin;
1624   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1625   C->ops->productsymbolic          = MatProductSymbolic_AtB;
1626   PetscFunctionReturn(0);
1627 }
1628 
1629 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
1630 {
1631   PetscFunctionBegin;
1632   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1633   C->ops->productsymbolic          = MatProductSymbolic_ABt;
1634   PetscFunctionReturn(0);
1635 }
1636 
1637 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1638 {
1639   Mat_Product    *product = C->product;
1640 
1641   PetscFunctionBegin;
1642   switch (product->type) {
1643   case MATPRODUCT_AB:
1644     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C));
1645     break;
1646   case MATPRODUCT_AtB:
1647     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C));
1648     break;
1649   case MATPRODUCT_ABt:
1650     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C));
1651     break;
1652   default:
1653     break;
1654   }
1655   PetscFunctionReturn(0);
1656 }
1657 /* ------------------------------------------------------- */
1658 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1659 {
1660   Mat_Product    *product = C->product;
1661   Mat            A = product->A;
1662   PetscBool      baij;
1663 
1664   PetscFunctionBegin;
1665   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij));
1666   if (!baij) { /* A is seqsbaij */
1667     PetscBool sbaij;
1668     PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij));
1669     PetscCheck(sbaij,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format");
1670 
1671     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
1672   } else { /* A is seqbaij */
1673     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
1674   }
1675 
1676   C->ops->productsymbolic = MatProductSymbolic_AB;
1677   PetscFunctionReturn(0);
1678 }
1679 
1680 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1681 {
1682   Mat_Product    *product = C->product;
1683 
1684   PetscFunctionBegin;
1685   MatCheckProduct(C,1);
1686   PetscCheck(product->A,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
1687   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C));
1688   PetscFunctionReturn(0);
1689 }
1690 
1691 /* ------------------------------------------------------- */
1692 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1693 {
1694   PetscFunctionBegin;
1695   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
1696   C->ops->productsymbolic = MatProductSymbolic_AB;
1697   PetscFunctionReturn(0);
1698 }
1699 
1700 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1701 {
1702   Mat_Product    *product = C->product;
1703 
1704   PetscFunctionBegin;
1705   if (product->type == MATPRODUCT_AB) {
1706     PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C));
1707   }
1708   PetscFunctionReturn(0);
1709 }
1710 /* ------------------------------------------------------- */
1711 
1712 PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1713 {
1714   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
1715   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
1716   PetscInt       *bi      = b->i,*bj=b->j;
1717   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1718   MatScalar      *btval,*btval_den,*ba=b->a;
1719   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1720 
1721   PetscFunctionBegin;
1722   btval_den=btdense->v;
1723   PetscCall(PetscArrayzero(btval_den,m*n));
1724   for (k=0; k<ncolors; k++) {
1725     ncolumns = coloring->ncolumns[k];
1726     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1727       col   = *(columns + colorforcol[k] + l);
1728       btcol = bj + bi[col];
1729       btval = ba + bi[col];
1730       anz   = bi[col+1] - bi[col];
1731       for (j=0; j<anz; j++) {
1732         brow            = btcol[j];
1733         btval_den[brow] = btval[j];
1734       }
1735     }
1736     btval_den += m;
1737   }
1738   PetscFunctionReturn(0);
1739 }
1740 
1741 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1742 {
1743   Mat_SeqAIJ        *csp = (Mat_SeqAIJ*)Csp->data;
1744   const PetscScalar *ca_den,*ca_den_ptr;
1745   PetscScalar       *ca=csp->a;
1746   PetscInt          k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1747   PetscInt          brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1748   PetscInt          nrows,*row,*idx;
1749   PetscInt          *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
1750 
1751   PetscFunctionBegin;
1752   PetscCall(MatDenseGetArrayRead(Cden,&ca_den));
1753 
1754   if (brows > 0) {
1755     PetscInt *lstart,row_end,row_start;
1756     lstart = matcoloring->lstart;
1757     PetscCall(PetscArrayzero(lstart,ncolors));
1758 
1759     row_end = brows;
1760     if (row_end > m) row_end = m;
1761     for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1762       ca_den_ptr = ca_den;
1763       for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1764         nrows = matcoloring->nrows[k];
1765         row   = rows  + colorforrow[k];
1766         idx   = den2sp + colorforrow[k];
1767         for (l=lstart[k]; l<nrows; l++) {
1768           if (row[l] >= row_end) {
1769             lstart[k] = l;
1770             break;
1771           } else {
1772             ca[idx[l]] = ca_den_ptr[row[l]];
1773           }
1774         }
1775         ca_den_ptr += m;
1776       }
1777       row_end += brows;
1778       if (row_end > m) row_end = m;
1779     }
1780   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1781     ca_den_ptr = ca_den;
1782     for (k=0; k<ncolors; k++) {
1783       nrows = matcoloring->nrows[k];
1784       row   = rows  + colorforrow[k];
1785       idx   = den2sp + colorforrow[k];
1786       for (l=0; l<nrows; l++) {
1787         ca[idx[l]] = ca_den_ptr[row[l]];
1788       }
1789       ca_den_ptr += m;
1790     }
1791   }
1792 
1793   PetscCall(MatDenseRestoreArrayRead(Cden,&ca_den));
1794 #if defined(PETSC_USE_INFO)
1795   if (matcoloring->brows > 0) {
1796     PetscCall(PetscInfo(Csp,"Loop over %" PetscInt_FMT " row blocks for den2sp\n",brows));
1797   } else {
1798     PetscCall(PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n"));
1799   }
1800 #endif
1801   PetscFunctionReturn(0);
1802 }
1803 
1804 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1805 {
1806   PetscInt       i,n,nrows,Nbs,j,k,m,ncols,col,cm;
1807   const PetscInt *is,*ci,*cj,*row_idx;
1808   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1809   IS             *isa;
1810   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1811   PetscInt       *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1812   PetscInt       *colorforcol,*columns,*columns_i,brows;
1813   PetscBool      flg;
1814 
1815   PetscFunctionBegin;
1816   PetscCall(ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa));
1817 
1818   /* bs >1 is not being tested yet! */
1819   Nbs       = mat->cmap->N/bs;
1820   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1821   c->N      = Nbs;
1822   c->m      = c->M;
1823   c->rstart = 0;
1824   c->brows  = 100;
1825 
1826   c->ncolors = nis;
1827   PetscCall(PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow));
1828   PetscCall(PetscMalloc1(csp->nz+1,&rows));
1829   PetscCall(PetscMalloc1(csp->nz+1,&den2sp));
1830 
1831   brows = c->brows;
1832   PetscCall(PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg));
1833   if (flg) c->brows = brows;
1834   if (brows > 0) {
1835     PetscCall(PetscMalloc1(nis+1,&c->lstart));
1836   }
1837 
1838   colorforrow[0] = 0;
1839   rows_i         = rows;
1840   den2sp_i       = den2sp;
1841 
1842   PetscCall(PetscMalloc1(nis+1,&colorforcol));
1843   PetscCall(PetscMalloc1(Nbs+1,&columns));
1844 
1845   colorforcol[0] = 0;
1846   columns_i      = columns;
1847 
1848   /* get column-wise storage of mat */
1849   PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL));
1850 
1851   cm   = c->m;
1852   PetscCall(PetscMalloc1(cm+1,&rowhit));
1853   PetscCall(PetscMalloc1(cm+1,&idxhit));
1854   for (i=0; i<nis; i++) { /* loop over color */
1855     PetscCall(ISGetLocalSize(isa[i],&n));
1856     PetscCall(ISGetIndices(isa[i],&is));
1857 
1858     c->ncolumns[i] = n;
1859     if (n) PetscCall(PetscArraycpy(columns_i,is,n));
1860     colorforcol[i+1] = colorforcol[i] + n;
1861     columns_i       += n;
1862 
1863     /* fast, crude version requires O(N*N) work */
1864     PetscCall(PetscArrayzero(rowhit,cm));
1865 
1866     for (j=0; j<n; j++) { /* loop over columns*/
1867       col     = is[j];
1868       row_idx = cj + ci[col];
1869       m       = ci[col+1] - ci[col];
1870       for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1871         idxhit[*row_idx]   = spidx[ci[col] + k];
1872         rowhit[*row_idx++] = col + 1;
1873       }
1874     }
1875     /* count the number of hits */
1876     nrows = 0;
1877     for (j=0; j<cm; j++) {
1878       if (rowhit[j]) nrows++;
1879     }
1880     c->nrows[i]      = nrows;
1881     colorforrow[i+1] = colorforrow[i] + nrows;
1882 
1883     nrows = 0;
1884     for (j=0; j<cm; j++) { /* loop over rows */
1885       if (rowhit[j]) {
1886         rows_i[nrows]   = j;
1887         den2sp_i[nrows] = idxhit[j];
1888         nrows++;
1889       }
1890     }
1891     den2sp_i += nrows;
1892 
1893     PetscCall(ISRestoreIndices(isa[i],&is));
1894     rows_i += nrows;
1895   }
1896   PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL));
1897   PetscCall(PetscFree(rowhit));
1898   PetscCall(ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa));
1899   PetscCheck(csp->nz == colorforrow[nis],PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT,csp->nz,colorforrow[nis]);
1900 
1901   c->colorforrow = colorforrow;
1902   c->rows        = rows;
1903   c->den2sp      = den2sp;
1904   c->colorforcol = colorforcol;
1905   c->columns     = columns;
1906 
1907   PetscCall(PetscFree(idxhit));
1908   PetscFunctionReturn(0);
1909 }
1910 
1911 /* --------------------------------------------------------------- */
1912 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1913 {
1914   Mat_Product    *product = C->product;
1915   Mat            A=product->A,B=product->B;
1916 
1917   PetscFunctionBegin;
1918   if (C->ops->mattransposemultnumeric) {
1919     /* Alg: "outerproduct" */
1920     PetscCall((*C->ops->mattransposemultnumeric)(A,B,C));
1921   } else {
1922     /* Alg: "matmatmult" -- C = At*B */
1923     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;
1924 
1925     PetscCheck(atb,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1926     if (atb->At) {
1927       /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ();
1928          user may have called MatProductReplaceMats() to get this A=product->A */
1929       PetscCall(MatTransposeSetPrecursor(A,atb->At));
1930       PetscCall(MatTranspose(A,MAT_REUSE_MATRIX,&atb->At));
1931     }
1932     PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A,B,C));
1933   }
1934   PetscFunctionReturn(0);
1935 }
1936 
1937 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1938 {
1939   Mat_Product    *product = C->product;
1940   Mat            A=product->A,B=product->B;
1941   PetscReal      fill=product->fill;
1942 
1943   PetscFunctionBegin;
1944   PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C));
1945 
1946   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
1947   PetscFunctionReturn(0);
1948 }
1949 
1950 /* --------------------------------------------------------------- */
1951 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1952 {
1953   Mat_Product    *product = C->product;
1954   PetscInt       alg = 0; /* default algorithm */
1955   PetscBool      flg = PETSC_FALSE;
1956 #if !defined(PETSC_HAVE_HYPRE)
1957   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
1958   PetscInt       nalg = 7;
1959 #else
1960   const char     *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"};
1961   PetscInt       nalg = 8;
1962 #endif
1963 
1964   PetscFunctionBegin;
1965   /* Set default algorithm */
1966   PetscCall(PetscStrcmp(C->product->alg,"default",&flg));
1967   if (flg) {
1968     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
1969   }
1970 
1971   /* Get runtime option */
1972   if (product->api_user) {
1973     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");
1974     PetscCall(PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg));
1975     PetscOptionsEnd();
1976   } else {
1977     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");
1978     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg));
1979     PetscOptionsEnd();
1980   }
1981   if (flg) {
1982     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
1983   }
1984 
1985   C->ops->productsymbolic = MatProductSymbolic_AB;
1986   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
1987   PetscFunctionReturn(0);
1988 }
1989 
1990 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
1991 {
1992   Mat_Product    *product = C->product;
1993   PetscInt       alg = 0; /* default algorithm */
1994   PetscBool      flg = PETSC_FALSE;
1995   const char     *algTypes[3] = {"default","at*b","outerproduct"};
1996   PetscInt       nalg = 3;
1997 
1998   PetscFunctionBegin;
1999   /* Get runtime option */
2000   if (product->api_user) {
2001     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");
2002     PetscCall(PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg));
2003     PetscOptionsEnd();
2004   } else {
2005     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");
2006     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg));
2007     PetscOptionsEnd();
2008   }
2009   if (flg) {
2010     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2011   }
2012 
2013   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
2014   PetscFunctionReturn(0);
2015 }
2016 
2017 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
2018 {
2019   Mat_Product    *product = C->product;
2020   PetscInt       alg = 0; /* default algorithm */
2021   PetscBool      flg = PETSC_FALSE;
2022   const char     *algTypes[2] = {"default","color"};
2023   PetscInt       nalg = 2;
2024 
2025   PetscFunctionBegin;
2026   /* Set default algorithm */
2027   PetscCall(PetscStrcmp(C->product->alg,"default",&flg));
2028   if (!flg) {
2029     alg = 1;
2030     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2031   }
2032 
2033   /* Get runtime option */
2034   if (product->api_user) {
2035     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");
2036     PetscCall(PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg));
2037     PetscOptionsEnd();
2038   } else {
2039     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");
2040     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg));
2041     PetscOptionsEnd();
2042   }
2043   if (flg) {
2044     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2045   }
2046 
2047   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
2048   C->ops->productsymbolic          = MatProductSymbolic_ABt;
2049   PetscFunctionReturn(0);
2050 }
2051 
2052 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
2053 {
2054   Mat_Product    *product = C->product;
2055   PetscBool      flg = PETSC_FALSE;
2056   PetscInt       alg = 0; /* default algorithm -- alg=1 should be default!!! */
2057 #if !defined(PETSC_HAVE_HYPRE)
2058   const char     *algTypes[2] = {"scalable","rap"};
2059   PetscInt       nalg = 2;
2060 #else
2061   const char     *algTypes[3] = {"scalable","rap","hypre"};
2062   PetscInt       nalg = 3;
2063 #endif
2064 
2065   PetscFunctionBegin;
2066   /* Set default algorithm */
2067   PetscCall(PetscStrcmp(product->alg,"default",&flg));
2068   if (flg) {
2069     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2070   }
2071 
2072   /* Get runtime option */
2073   if (product->api_user) {
2074     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");
2075     PetscCall(PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg));
2076     PetscOptionsEnd();
2077   } else {
2078     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");
2079     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg));
2080     PetscOptionsEnd();
2081   }
2082   if (flg) {
2083     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2084   }
2085 
2086   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
2087   PetscFunctionReturn(0);
2088 }
2089 
2090 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2091 {
2092   Mat_Product    *product = C->product;
2093   PetscBool      flg = PETSC_FALSE;
2094   PetscInt       alg = 0; /* default algorithm */
2095   const char     *algTypes[3] = {"r*a*rt","r*art","coloring_rart"};
2096   PetscInt        nalg = 3;
2097 
2098   PetscFunctionBegin;
2099   /* Set default algorithm */
2100   PetscCall(PetscStrcmp(product->alg,"default",&flg));
2101   if (flg) {
2102     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2103   }
2104 
2105   /* Get runtime option */
2106   if (product->api_user) {
2107     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");
2108     PetscCall(PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg));
2109     PetscOptionsEnd();
2110   } else {
2111     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");
2112     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg));
2113     PetscOptionsEnd();
2114   }
2115   if (flg) {
2116     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2117   }
2118 
2119   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
2120   PetscFunctionReturn(0);
2121 }
2122 
2123 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
2124 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2125 {
2126   Mat_Product    *product = C->product;
2127   PetscInt       alg = 0; /* default algorithm */
2128   PetscBool      flg = PETSC_FALSE;
2129   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
2130   PetscInt       nalg = 7;
2131 
2132   PetscFunctionBegin;
2133   /* Set default algorithm */
2134   PetscCall(PetscStrcmp(product->alg,"default",&flg));
2135   if (flg) {
2136     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2137   }
2138 
2139   /* Get runtime option */
2140   if (product->api_user) {
2141     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");
2142     PetscCall(PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg));
2143     PetscOptionsEnd();
2144   } else {
2145     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");
2146     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg));
2147     PetscOptionsEnd();
2148   }
2149   if (flg) {
2150     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2151   }
2152 
2153   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
2154   C->ops->productsymbolic    = MatProductSymbolic_ABC;
2155   PetscFunctionReturn(0);
2156 }
2157 
2158 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2159 {
2160   Mat_Product    *product = C->product;
2161 
2162   PetscFunctionBegin;
2163   switch (product->type) {
2164   case MATPRODUCT_AB:
2165     PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C));
2166     break;
2167   case MATPRODUCT_AtB:
2168     PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C));
2169     break;
2170   case MATPRODUCT_ABt:
2171     PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C));
2172     break;
2173   case MATPRODUCT_PtAP:
2174     PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C));
2175     break;
2176   case MATPRODUCT_RARt:
2177     PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C));
2178     break;
2179   case MATPRODUCT_ABC:
2180     PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C));
2181     break;
2182   default:
2183     break;
2184   }
2185   PetscFunctionReturn(0);
2186 }
2187