xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision ebead697dbf761eb322f829370bbe90b3bd93fa3)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4           C = A * B
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <petscbt.h>
10 #include <petsc/private/isimpl.h>
11 #include <../src/mat/impls/dense/seq/dense.h>
12 
13 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
14 {
15   PetscFunctionBegin;
16   if (C->ops->matmultnumeric) {
17     PetscCheck(C->ops->matmultnumeric != MatMatMultNumeric_SeqAIJ_SeqAIJ,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Recursive call");
18     PetscCall((*C->ops->matmultnumeric)(A,B,C));
19   } else {
20     PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C));
21   }
22   PetscFunctionReturn(0);
23 }
24 
25 /* Modified from MatCreateSeqAIJWithArrays() */
26 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat)
27 {
28   PetscInt       ii;
29   Mat_SeqAIJ     *aij;
30   PetscBool      isseqaij, osingle, ofree_a, ofree_ij;
31 
32   PetscFunctionBegin;
33   PetscCheck(m <= 0 || !i[0],PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0");
34   PetscCall(MatSetSizes(mat,m,n,m,n));
35 
36   if (!mtype) {
37     PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij));
38     if (!isseqaij) PetscCall(MatSetType(mat,MATSEQAIJ));
39   } else {
40     PetscCall(MatSetType(mat,mtype));
41   }
42 
43   aij  = (Mat_SeqAIJ*)(mat)->data;
44   osingle = aij->singlemalloc;
45   ofree_a = aij->free_a;
46   ofree_ij = aij->free_ij;
47   /* changes the free flags */
48   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,NULL));
49 
50   PetscCall(PetscFree(aij->ilen));
51   PetscCall(PetscFree(aij->imax));
52   PetscCall(PetscMalloc1(m,&aij->imax));
53   PetscCall(PetscMalloc1(m,&aij->ilen));
54   for (ii=0,aij->nonzerorowcnt=0,aij->rmax = 0; ii<m; ii++) {
55     const PetscInt rnz = i[ii+1] - i[ii];
56     aij->nonzerorowcnt += !!rnz;
57     aij->rmax = PetscMax(aij->rmax,rnz);
58     aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii];
59   }
60   aij->maxnz = i[m];
61   aij->nz = i[m];
62 
63   if (osingle) {
64     PetscCall(PetscFree3(aij->a,aij->j,aij->i));
65   } else {
66     if (ofree_a)  PetscCall(PetscFree(aij->a));
67     if (ofree_ij) PetscCall(PetscFree(aij->j));
68     if (ofree_ij) PetscCall(PetscFree(aij->i));
69   }
70   aij->i            = i;
71   aij->j            = j;
72   aij->a            = a;
73   aij->nonew        = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
74   /* default to not retain ownership */
75   aij->singlemalloc = PETSC_FALSE;
76   aij->free_a       = PETSC_FALSE;
77   aij->free_ij      = PETSC_FALSE;
78   PetscCall(MatCheckCompressedRow(mat,aij->nonzerorowcnt,&aij->compressedrow,aij->i,m,0.6));
79   PetscFunctionReturn(0);
80 }
81 
82 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
83 {
84   Mat_Product         *product = C->product;
85   MatProductAlgorithm alg;
86   PetscBool           flg;
87 
88   PetscFunctionBegin;
89   if (product) {
90     alg = product->alg;
91   } else {
92     alg = "sorted";
93   }
94   /* sorted */
95   PetscCall(PetscStrcmp(alg,"sorted",&flg));
96   if (flg) {
97     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C));
98     PetscFunctionReturn(0);
99   }
100 
101   /* scalable */
102   PetscCall(PetscStrcmp(alg,"scalable",&flg));
103   if (flg) {
104     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C));
105     PetscFunctionReturn(0);
106   }
107 
108   /* scalable_fast */
109   PetscCall(PetscStrcmp(alg,"scalable_fast",&flg));
110   if (flg) {
111     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C));
112     PetscFunctionReturn(0);
113   }
114 
115   /* heap */
116   PetscCall(PetscStrcmp(alg,"heap",&flg));
117   if (flg) {
118     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C));
119     PetscFunctionReturn(0);
120   }
121 
122   /* btheap */
123   PetscCall(PetscStrcmp(alg,"btheap",&flg));
124   if (flg) {
125     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C));
126     PetscFunctionReturn(0);
127   }
128 
129   /* llcondensed */
130   PetscCall(PetscStrcmp(alg,"llcondensed",&flg));
131   if (flg) {
132     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C));
133     PetscFunctionReturn(0);
134   }
135 
136   /* rowmerge */
137   PetscCall(PetscStrcmp(alg,"rowmerge",&flg));
138   if (flg) {
139     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C));
140     PetscFunctionReturn(0);
141   }
142 
143 #if defined(PETSC_HAVE_HYPRE)
144   PetscCall(PetscStrcmp(alg,"hypre",&flg));
145   if (flg) {
146     PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C));
147     PetscFunctionReturn(0);
148   }
149 #endif
150 
151   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
152 }
153 
154 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C)
155 {
156   Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
157   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
158   PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
159   PetscReal          afill;
160   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
161   PetscTable         ta;
162   PetscBT            lnkbt;
163   PetscFreeSpaceList free_space=NULL,current_space=NULL;
164 
165   PetscFunctionBegin;
166   /* Get ci and cj */
167   /*---------------*/
168   /* Allocate ci array, arrays for fill computation and */
169   /* free space for accumulating nonzero column info */
170   PetscCall(PetscMalloc1(am+2,&ci));
171   ci[0] = 0;
172 
173   /* create and initialize a linked list */
174   PetscCall(PetscTableCreate(bn,bn,&ta));
175   MatRowMergeMax_SeqAIJ(b,bm,ta);
176   PetscCall(PetscTableGetCount(ta,&Crmax));
177   PetscCall(PetscTableDestroy(&ta));
178 
179   PetscCall(PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt));
180 
181   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
182   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
183 
184   current_space = free_space;
185 
186   /* Determine ci and cj */
187   for (i=0; i<am; i++) {
188     anzi = ai[i+1] - ai[i];
189     aj   = a->j + ai[i];
190     for (j=0; j<anzi; j++) {
191       brow = aj[j];
192       bnzj = bi[brow+1] - bi[brow];
193       bj   = b->j + bi[brow];
194       /* add non-zero cols of B into the sorted linked list lnk */
195       PetscCall(PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt));
196     }
197     /* add possible missing diagonal entry */
198     if (C->force_diagonals) {
199       PetscCall(PetscLLCondensedAddSorted(1,&i,lnk,lnkbt));
200     }
201     cnzi = lnk[0];
202 
203     /* If free space is not available, make more free space */
204     /* Double the amount of total space in the list */
205     if (current_space->local_remaining<cnzi) {
206       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space));
207       ndouble++;
208     }
209 
210     /* Copy data into free space, then initialize lnk */
211     PetscCall(PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt));
212 
213     current_space->array           += cnzi;
214     current_space->local_used      += cnzi;
215     current_space->local_remaining -= cnzi;
216 
217     ci[i+1] = ci[i] + cnzi;
218   }
219 
220   /* Column indices are in the list of free space */
221   /* Allocate space for cj, initialize cj, and */
222   /* destroy list of free space and other temporary array(s) */
223   PetscCall(PetscMalloc1(ci[am]+1,&cj));
224   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
225   PetscCall(PetscLLCondensedDestroy(lnk,lnkbt));
226 
227   /* put together the new symbolic matrix */
228   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
229   PetscCall(MatSetBlockSizesFromMats(C,A,B));
230 
231   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
232   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
233   c          = (Mat_SeqAIJ*)(C->data);
234   c->free_a  = PETSC_FALSE;
235   c->free_ij = PETSC_TRUE;
236   c->nonew   = 0;
237 
238   /* fast, needs non-scalable O(bn) array 'abdense' */
239   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
240 
241   /* set MatInfo */
242   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
243   if (afill < 1.0) afill = 1.0;
244   C->info.mallocs           = ndouble;
245   C->info.fill_ratio_given  = fill;
246   C->info.fill_ratio_needed = afill;
247 
248 #if defined(PETSC_USE_INFO)
249   if (ci[am]) {
250     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
251     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
252   } else {
253     PetscCall(PetscInfo(C,"Empty matrix product\n"));
254   }
255 #endif
256   PetscFunctionReturn(0);
257 }
258 
259 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C)
260 {
261   PetscLogDouble    flops=0.0;
262   Mat_SeqAIJ        *a   = (Mat_SeqAIJ*)A->data;
263   Mat_SeqAIJ        *b   = (Mat_SeqAIJ*)B->data;
264   Mat_SeqAIJ        *c   = (Mat_SeqAIJ*)C->data;
265   PetscInt          *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
266   PetscInt          am   =A->rmap->n,cm=C->rmap->n;
267   PetscInt          i,j,k,anzi,bnzi,cnzi,brow;
268   PetscScalar       *ca,valtmp;
269   PetscScalar       *ab_dense;
270   PetscContainer    cab_dense;
271   const PetscScalar *aa,*ba,*baj;
272 
273   PetscFunctionBegin;
274   PetscCall(MatSeqAIJGetArrayRead(A,&aa));
275   PetscCall(MatSeqAIJGetArrayRead(B,&ba));
276   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
277     PetscCall(PetscMalloc1(ci[cm]+1,&ca));
278     c->a      = ca;
279     c->free_a = PETSC_TRUE;
280   } else ca = c->a;
281 
282   /* TODO this should be done in the symbolic phase */
283   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
284      that is hard to eradicate) */
285   PetscCall(PetscObjectQuery((PetscObject)C,"__PETSc__ab_dense",(PetscObject*)&cab_dense));
286   if (!cab_dense) {
287     PetscCall(PetscMalloc1(B->cmap->N,&ab_dense));
288     PetscCall(PetscContainerCreate(PETSC_COMM_SELF,&cab_dense));
289     PetscCall(PetscContainerSetPointer(cab_dense,ab_dense));
290     PetscCall(PetscContainerSetUserDestroy(cab_dense,PetscContainerUserDestroyDefault));
291     PetscCall(PetscObjectCompose((PetscObject)C,"__PETSc__ab_dense",(PetscObject)cab_dense));
292     PetscCall(PetscObjectDereference((PetscObject)cab_dense));
293   }
294   PetscCall(PetscContainerGetPointer(cab_dense,(void**)&ab_dense));
295   PetscCall(PetscArrayzero(ab_dense,B->cmap->N));
296 
297   /* clean old values in C */
298   PetscCall(PetscArrayzero(ca,ci[cm]));
299   /* Traverse A row-wise. */
300   /* Build the ith row in C by summing over nonzero columns in A, */
301   /* the rows of B corresponding to nonzeros of A. */
302   for (i=0; i<am; i++) {
303     anzi = ai[i+1] - ai[i];
304     for (j=0; j<anzi; j++) {
305       brow = aj[j];
306       bnzi = bi[brow+1] - bi[brow];
307       bjj  = bj + bi[brow];
308       baj  = ba + bi[brow];
309       /* perform dense axpy */
310       valtmp = aa[j];
311       for (k=0; k<bnzi; k++) {
312         ab_dense[bjj[k]] += valtmp*baj[k];
313       }
314       flops += 2*bnzi;
315     }
316     aj += anzi; aa += anzi;
317 
318     cnzi = ci[i+1] - ci[i];
319     for (k=0; k<cnzi; k++) {
320       ca[k]          += ab_dense[cj[k]];
321       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
322     }
323     flops += cnzi;
324     cj    += cnzi; ca += cnzi;
325   }
326 #if defined(PETSC_HAVE_DEVICE)
327   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
328 #endif
329   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
330   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
331   PetscCall(PetscLogFlops(flops));
332   PetscCall(MatSeqAIJRestoreArrayRead(A,&aa));
333   PetscCall(MatSeqAIJRestoreArrayRead(B,&ba));
334   PetscFunctionReturn(0);
335 }
336 
337 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
338 {
339   PetscLogDouble    flops=0.0;
340   Mat_SeqAIJ        *a   = (Mat_SeqAIJ*)A->data;
341   Mat_SeqAIJ        *b   = (Mat_SeqAIJ*)B->data;
342   Mat_SeqAIJ        *c   = (Mat_SeqAIJ*)C->data;
343   PetscInt          *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
344   PetscInt          am   = A->rmap->N,cm=C->rmap->N;
345   PetscInt          i,j,k,anzi,bnzi,cnzi,brow;
346   PetscScalar       *ca=c->a,valtmp;
347   const PetscScalar *aa,*ba,*baj;
348   PetscInt          nextb;
349 
350   PetscFunctionBegin;
351   PetscCall(MatSeqAIJGetArrayRead(A,&aa));
352   PetscCall(MatSeqAIJGetArrayRead(B,&ba));
353   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
354     PetscCall(PetscMalloc1(ci[cm]+1,&ca));
355     c->a      = ca;
356     c->free_a = PETSC_TRUE;
357   }
358 
359   /* clean old values in C */
360   PetscCall(PetscArrayzero(ca,ci[cm]));
361   /* Traverse A row-wise. */
362   /* Build the ith row in C by summing over nonzero columns in A, */
363   /* the rows of B corresponding to nonzeros of A. */
364   for (i=0; i<am; i++) {
365     anzi = ai[i+1] - ai[i];
366     cnzi = ci[i+1] - ci[i];
367     for (j=0; j<anzi; j++) {
368       brow = aj[j];
369       bnzi = bi[brow+1] - bi[brow];
370       bjj  = bj + bi[brow];
371       baj  = ba + bi[brow];
372       /* perform sparse axpy */
373       valtmp = aa[j];
374       nextb  = 0;
375       for (k=0; nextb<bnzi; k++) {
376         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
377           ca[k] += valtmp*baj[nextb++];
378         }
379       }
380       flops += 2*bnzi;
381     }
382     aj += anzi; aa += anzi;
383     cj += cnzi; ca += cnzi;
384   }
385 #if defined(PETSC_HAVE_DEVICE)
386   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
387 #endif
388   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
389   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
390   PetscCall(PetscLogFlops(flops));
391   PetscCall(MatSeqAIJRestoreArrayRead(A,&aa));
392   PetscCall(MatSeqAIJRestoreArrayRead(B,&ba));
393   PetscFunctionReturn(0);
394 }
395 
396 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C)
397 {
398   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
399   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
400   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
401   MatScalar          *ca;
402   PetscReal          afill;
403   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
404   PetscTable         ta;
405   PetscFreeSpaceList free_space=NULL,current_space=NULL;
406 
407   PetscFunctionBegin;
408   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
409   /*-----------------------------------------------------------------------------------------*/
410   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
411   PetscCall(PetscMalloc1(am+2,&ci));
412   ci[0] = 0;
413 
414   /* create and initialize a linked list */
415   PetscCall(PetscTableCreate(bn,bn,&ta));
416   MatRowMergeMax_SeqAIJ(b,bm,ta);
417   PetscCall(PetscTableGetCount(ta,&Crmax));
418   PetscCall(PetscTableDestroy(&ta));
419 
420   PetscCall(PetscLLCondensedCreate_fast(Crmax,&lnk));
421 
422   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
423   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
424   current_space = free_space;
425 
426   /* Determine ci and cj */
427   for (i=0; i<am; i++) {
428     anzi = ai[i+1] - ai[i];
429     aj   = a->j + ai[i];
430     for (j=0; j<anzi; j++) {
431       brow = aj[j];
432       bnzj = bi[brow+1] - bi[brow];
433       bj   = b->j + bi[brow];
434       /* add non-zero cols of B into the sorted linked list lnk */
435       PetscCall(PetscLLCondensedAddSorted_fast(bnzj,bj,lnk));
436     }
437     /* add possible missing diagonal entry */
438     if (C->force_diagonals) {
439       PetscCall(PetscLLCondensedAddSorted_fast(1,&i,lnk));
440     }
441     cnzi = lnk[1];
442 
443     /* If free space is not available, make more free space */
444     /* Double the amount of total space in the list */
445     if (current_space->local_remaining<cnzi) {
446       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space));
447       ndouble++;
448     }
449 
450     /* Copy data into free space, then initialize lnk */
451     PetscCall(PetscLLCondensedClean_fast(cnzi,current_space->array,lnk));
452 
453     current_space->array           += cnzi;
454     current_space->local_used      += cnzi;
455     current_space->local_remaining -= cnzi;
456 
457     ci[i+1] = ci[i] + cnzi;
458   }
459 
460   /* Column indices are in the list of free space */
461   /* Allocate space for cj, initialize cj, and */
462   /* destroy list of free space and other temporary array(s) */
463   PetscCall(PetscMalloc1(ci[am]+1,&cj));
464   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
465   PetscCall(PetscLLCondensedDestroy_fast(lnk));
466 
467   /* Allocate space for ca */
468   PetscCall(PetscCalloc1(ci[am]+1,&ca));
469 
470   /* put together the new symbolic matrix */
471   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C));
472   PetscCall(MatSetBlockSizesFromMats(C,A,B));
473 
474   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
475   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
476   c          = (Mat_SeqAIJ*)(C->data);
477   c->free_a  = PETSC_TRUE;
478   c->free_ij = PETSC_TRUE;
479   c->nonew   = 0;
480 
481   /* slower, less memory */
482   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
483 
484   /* set MatInfo */
485   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
486   if (afill < 1.0) afill = 1.0;
487   C->info.mallocs           = ndouble;
488   C->info.fill_ratio_given  = fill;
489   C->info.fill_ratio_needed = afill;
490 
491 #if defined(PETSC_USE_INFO)
492   if (ci[am]) {
493     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
494     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
495   } else {
496     PetscCall(PetscInfo(C,"Empty matrix product\n"));
497   }
498 #endif
499   PetscFunctionReturn(0);
500 }
501 
502 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C)
503 {
504   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
505   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
506   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
507   MatScalar          *ca;
508   PetscReal          afill;
509   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
510   PetscTable         ta;
511   PetscFreeSpaceList free_space=NULL,current_space=NULL;
512 
513   PetscFunctionBegin;
514   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
515   /*---------------------------------------------------------------------------------------------*/
516   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
517   PetscCall(PetscMalloc1(am+2,&ci));
518   ci[0] = 0;
519 
520   /* create and initialize a linked list */
521   PetscCall(PetscTableCreate(bn,bn,&ta));
522   MatRowMergeMax_SeqAIJ(b,bm,ta);
523   PetscCall(PetscTableGetCount(ta,&Crmax));
524   PetscCall(PetscTableDestroy(&ta));
525   PetscCall(PetscLLCondensedCreate_Scalable(Crmax,&lnk));
526 
527   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
528   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
529   current_space = free_space;
530 
531   /* Determine ci and cj */
532   for (i=0; i<am; i++) {
533     anzi = ai[i+1] - ai[i];
534     aj   = a->j + ai[i];
535     for (j=0; j<anzi; j++) {
536       brow = aj[j];
537       bnzj = bi[brow+1] - bi[brow];
538       bj   = b->j + bi[brow];
539       /* add non-zero cols of B into the sorted linked list lnk */
540       PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk));
541     }
542     /* add possible missing diagonal entry */
543     if (C->force_diagonals) {
544       PetscCall(PetscLLCondensedAddSorted_Scalable(1,&i,lnk));
545     }
546 
547     cnzi = lnk[0];
548 
549     /* If free space is not available, make more free space */
550     /* Double the amount of total space in the list */
551     if (current_space->local_remaining<cnzi) {
552       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space));
553       ndouble++;
554     }
555 
556     /* Copy data into free space, then initialize lnk */
557     PetscCall(PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk));
558 
559     current_space->array           += cnzi;
560     current_space->local_used      += cnzi;
561     current_space->local_remaining -= cnzi;
562 
563     ci[i+1] = ci[i] + cnzi;
564   }
565 
566   /* Column indices are in the list of free space */
567   /* Allocate space for cj, initialize cj, and */
568   /* destroy list of free space and other temporary array(s) */
569   PetscCall(PetscMalloc1(ci[am]+1,&cj));
570   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
571   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
572 
573   /* Allocate space for ca */
574   /*-----------------------*/
575   PetscCall(PetscCalloc1(ci[am]+1,&ca));
576 
577   /* put together the new symbolic matrix */
578   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C));
579   PetscCall(MatSetBlockSizesFromMats(C,A,B));
580 
581   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
582   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
583   c          = (Mat_SeqAIJ*)(C->data);
584   c->free_a  = PETSC_TRUE;
585   c->free_ij = PETSC_TRUE;
586   c->nonew   = 0;
587 
588   /* slower, less memory */
589   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
590 
591   /* set MatInfo */
592   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
593   if (afill < 1.0) afill = 1.0;
594   C->info.mallocs           = ndouble;
595   C->info.fill_ratio_given  = fill;
596   C->info.fill_ratio_needed = afill;
597 
598 #if defined(PETSC_USE_INFO)
599   if (ci[am]) {
600     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
601     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
602   } else {
603     PetscCall(PetscInfo(C,"Empty matrix product\n"));
604   }
605 #endif
606   PetscFunctionReturn(0);
607 }
608 
609 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C)
610 {
611   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
612   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
613   PetscInt           *ci,*cj,*bb;
614   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
615   PetscReal          afill;
616   PetscInt           i,j,col,ndouble = 0;
617   PetscFreeSpaceList free_space=NULL,current_space=NULL;
618   PetscHeap          h;
619 
620   PetscFunctionBegin;
621   /* Get ci and cj - by merging sorted rows using a heap */
622   /*---------------------------------------------------------------------------------------------*/
623   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
624   PetscCall(PetscMalloc1(am+2,&ci));
625   ci[0] = 0;
626 
627   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
628   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
629   current_space = free_space;
630 
631   PetscCall(PetscHeapCreate(a->rmax,&h));
632   PetscCall(PetscMalloc1(a->rmax,&bb));
633 
634   /* Determine ci and cj */
635   for (i=0; i<am; i++) {
636     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
637     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
638     ci[i+1] = ci[i];
639     /* Populate the min heap */
640     for (j=0; j<anzi; j++) {
641       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
642       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
643         PetscCall(PetscHeapAdd(h,j,bj[bb[j]++]));
644       }
645     }
646     /* Pick off the min element, adding it to free space */
647     PetscCall(PetscHeapPop(h,&j,&col));
648     while (j >= 0) {
649       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
650         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space));
651         ndouble++;
652       }
653       *(current_space->array++) = col;
654       current_space->local_used++;
655       current_space->local_remaining--;
656       ci[i+1]++;
657 
658       /* stash if anything else remains in this row of B */
659       if (bb[j] < bi[acol[j]+1]) PetscCall(PetscHeapStash(h,j,bj[bb[j]++]));
660       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
661         PetscInt j2,col2;
662         PetscCall(PetscHeapPeek(h,&j2,&col2));
663         if (col2 != col) break;
664         PetscCall(PetscHeapPop(h,&j2,&col2));
665         if (bb[j2] < bi[acol[j2]+1]) PetscCall(PetscHeapStash(h,j2,bj[bb[j2]++]));
666       }
667       /* Put any stashed elements back into the min heap */
668       PetscCall(PetscHeapUnstash(h));
669       PetscCall(PetscHeapPop(h,&j,&col));
670     }
671   }
672   PetscCall(PetscFree(bb));
673   PetscCall(PetscHeapDestroy(&h));
674 
675   /* Column indices are in the list of free space */
676   /* Allocate space for cj, initialize cj, and */
677   /* destroy list of free space and other temporary array(s) */
678   PetscCall(PetscMalloc1(ci[am],&cj));
679   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
680 
681   /* put together the new symbolic matrix */
682   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
683   PetscCall(MatSetBlockSizesFromMats(C,A,B));
684 
685   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
686   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
687   c          = (Mat_SeqAIJ*)(C->data);
688   c->free_a  = PETSC_TRUE;
689   c->free_ij = PETSC_TRUE;
690   c->nonew   = 0;
691 
692   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
693 
694   /* set MatInfo */
695   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
696   if (afill < 1.0) afill = 1.0;
697   C->info.mallocs           = ndouble;
698   C->info.fill_ratio_given  = fill;
699   C->info.fill_ratio_needed = afill;
700 
701 #if defined(PETSC_USE_INFO)
702   if (ci[am]) {
703     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
704     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
705   } else {
706     PetscCall(PetscInfo(C,"Empty matrix product\n"));
707   }
708 #endif
709   PetscFunctionReturn(0);
710 }
711 
712 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C)
713 {
714   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
715   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
716   PetscInt           *ci,*cj,*bb;
717   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
718   PetscReal          afill;
719   PetscInt           i,j,col,ndouble = 0;
720   PetscFreeSpaceList free_space=NULL,current_space=NULL;
721   PetscHeap          h;
722   PetscBT            bt;
723 
724   PetscFunctionBegin;
725   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
726   /*---------------------------------------------------------------------------------------------*/
727   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
728   PetscCall(PetscMalloc1(am+2,&ci));
729   ci[0] = 0;
730 
731   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
732   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
733 
734   current_space = free_space;
735 
736   PetscCall(PetscHeapCreate(a->rmax,&h));
737   PetscCall(PetscMalloc1(a->rmax,&bb));
738   PetscCall(PetscBTCreate(bn,&bt));
739 
740   /* Determine ci and cj */
741   for (i=0; i<am; i++) {
742     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
743     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
744     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
745     ci[i+1] = ci[i];
746     /* Populate the min heap */
747     for (j=0; j<anzi; j++) {
748       PetscInt brow = acol[j];
749       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
750         PetscInt bcol = bj[bb[j]];
751         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
752           PetscCall(PetscHeapAdd(h,j,bcol));
753           bb[j]++;
754           break;
755         }
756       }
757     }
758     /* Pick off the min element, adding it to free space */
759     PetscCall(PetscHeapPop(h,&j,&col));
760     while (j >= 0) {
761       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
762         fptr = NULL;                      /* need PetscBTMemzero */
763         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space));
764         ndouble++;
765       }
766       *(current_space->array++) = col;
767       current_space->local_used++;
768       current_space->local_remaining--;
769       ci[i+1]++;
770 
771       /* stash if anything else remains in this row of B */
772       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
773         PetscInt bcol = bj[bb[j]];
774         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
775           PetscCall(PetscHeapAdd(h,j,bcol));
776           bb[j]++;
777           break;
778         }
779       }
780       PetscCall(PetscHeapPop(h,&j,&col));
781     }
782     if (fptr) {                 /* Clear the bits for this row */
783       for (; fptr<current_space->array; fptr++) PetscCall(PetscBTClear(bt,*fptr));
784     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
785       PetscCall(PetscBTMemzero(bn,bt));
786     }
787   }
788   PetscCall(PetscFree(bb));
789   PetscCall(PetscHeapDestroy(&h));
790   PetscCall(PetscBTDestroy(&bt));
791 
792   /* Column indices are in the list of free space */
793   /* Allocate space for cj, initialize cj, and */
794   /* destroy list of free space and other temporary array(s) */
795   PetscCall(PetscMalloc1(ci[am],&cj));
796   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
797 
798   /* put together the new symbolic matrix */
799   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
800   PetscCall(MatSetBlockSizesFromMats(C,A,B));
801 
802   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
803   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
804   c          = (Mat_SeqAIJ*)(C->data);
805   c->free_a  = PETSC_TRUE;
806   c->free_ij = PETSC_TRUE;
807   c->nonew   = 0;
808 
809   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
810 
811   /* set MatInfo */
812   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
813   if (afill < 1.0) afill = 1.0;
814   C->info.mallocs           = ndouble;
815   C->info.fill_ratio_given  = fill;
816   C->info.fill_ratio_needed = afill;
817 
818 #if defined(PETSC_USE_INFO)
819   if (ci[am]) {
820     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
821     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
822   } else {
823     PetscCall(PetscInfo(C,"Empty matrix product\n"));
824   }
825 #endif
826   PetscFunctionReturn(0);
827 }
828 
829 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C)
830 {
831   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
832   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1;
833   PetscInt           *ci,*cj,*outputj,worki_L1[9],worki_L2[9];
834   PetscInt           c_maxmem,a_maxrownnz=0,a_rownnz;
835   const PetscInt     workcol[8]={0,1,2,3,4,5,6,7};
836   const PetscInt     am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
837   const PetscInt     *brow_ptr[8],*brow_end[8];
838   PetscInt           window[8];
839   PetscInt           window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows;
840   PetscInt           i,k,ndouble=0,L1_rowsleft,rowsleft;
841   PetscReal          afill;
842   PetscInt           *workj_L1,*workj_L2,*workj_L3;
843   PetscInt           L1_nnz,L2_nnz;
844 
845   /* Step 1: Get upper bound on memory required for allocation.
846              Because of the way virtual memory works,
847              only the memory pages that are actually needed will be physically allocated. */
848   PetscFunctionBegin;
849   PetscCall(PetscMalloc1(am+1,&ci));
850   for (i=0; i<am; i++) {
851     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
852     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
853     a_rownnz = 0;
854     for (k=0; k<anzi; ++k) {
855       a_rownnz += bi[acol[k]+1] - bi[acol[k]];
856       if (a_rownnz > bn) {
857         a_rownnz = bn;
858         break;
859       }
860     }
861     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
862   }
863   /* temporary work areas for merging rows */
864   PetscCall(PetscMalloc1(a_maxrownnz*8,&workj_L1));
865   PetscCall(PetscMalloc1(a_maxrownnz*8,&workj_L2));
866   PetscCall(PetscMalloc1(a_maxrownnz,&workj_L3));
867 
868   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
869   c_maxmem = 8*(ai[am]+bi[bm]);
870   /* Step 2: Populate pattern for C */
871   PetscCall(PetscMalloc1(c_maxmem,&cj));
872 
873   ci_nnz       = 0;
874   ci[0]        = 0;
875   worki_L1[0]  = 0;
876   worki_L2[0]  = 0;
877   for (i=0; i<am; i++) {
878     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
879     const PetscInt *acol = aj + ai[i];      /* column indices of nonzero entries in this row */
880     rowsleft             = anzi;
881     inputcol_L1          = acol;
882     L2_nnz               = 0;
883     L2_nrows             = 1;  /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
884     worki_L2[1]          = 0;
885     outputi_nnz          = 0;
886 
887     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
888     while (ci_nnz+a_maxrownnz > c_maxmem) {
889       c_maxmem *= 2;
890       ndouble++;
891       PetscCall(PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj));
892     }
893 
894     while (rowsleft) {
895       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
896       L1_nrows    = 0;
897       L1_nnz      = 0;
898       inputcol    = inputcol_L1;
899       inputi      = bi;
900       inputj      = bj;
901 
902       /* The following macro is used to specialize for small rows in A.
903          This helps with compiler unrolling, improving performance substantially.
904           Input:  inputj   inputi  inputcol  bn
905           Output: outputj  outputi_nnz                       */
906        #define MatMatMultSymbolic_RowMergeMacro(ANNZ)                        \
907          window_min  = bn;                                                   \
908          outputi_nnz = 0;                                                    \
909          for (k=0; k<ANNZ; ++k) {                                            \
910            brow_ptr[k] = inputj + inputi[inputcol[k]];                       \
911            brow_end[k] = inputj + inputi[inputcol[k]+1];                     \
912            window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn;   \
913            window_min  = PetscMin(window[k], window_min);                    \
914          }                                                                   \
915          while (window_min < bn) {                                           \
916            outputj[outputi_nnz++] = window_min;                              \
917            /* advance front and compute new minimum */                       \
918            old_window_min = window_min;                                      \
919            window_min = bn;                                                  \
920            for (k=0; k<ANNZ; ++k) {                                          \
921              if (window[k] == old_window_min) {                              \
922                brow_ptr[k]++;                                                \
923                window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
924              }                                                               \
925              window_min = PetscMin(window[k], window_min);                   \
926            }                                                                 \
927          }
928 
929       /************** L E V E L  1 ***************/
930       /* Merge up to 8 rows of B to L1 work array*/
931       while (L1_rowsleft) {
932         outputi_nnz = 0;
933         if (anzi > 8)  outputj = workj_L1 + L1_nnz;     /* Level 1 rowmerge*/
934         else           outputj = cj + ci_nnz;           /* Merge directly to C */
935 
936         switch (L1_rowsleft) {
937         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
938                  brow_end[0] = inputj + inputi[inputcol[0]+1];
939                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
940                  inputcol    += L1_rowsleft;
941                  rowsleft    -= L1_rowsleft;
942                  L1_rowsleft  = 0;
943                  break;
944         case 2:  MatMatMultSymbolic_RowMergeMacro(2);
945                  inputcol    += L1_rowsleft;
946                  rowsleft    -= L1_rowsleft;
947                  L1_rowsleft  = 0;
948                  break;
949         case 3: MatMatMultSymbolic_RowMergeMacro(3);
950                  inputcol    += L1_rowsleft;
951                  rowsleft    -= L1_rowsleft;
952                  L1_rowsleft  = 0;
953                  break;
954         case 4:  MatMatMultSymbolic_RowMergeMacro(4);
955                  inputcol    += L1_rowsleft;
956                  rowsleft    -= L1_rowsleft;
957                  L1_rowsleft  = 0;
958                  break;
959         case 5:  MatMatMultSymbolic_RowMergeMacro(5);
960                  inputcol    += L1_rowsleft;
961                  rowsleft    -= L1_rowsleft;
962                  L1_rowsleft  = 0;
963                  break;
964         case 6:  MatMatMultSymbolic_RowMergeMacro(6);
965                  inputcol    += L1_rowsleft;
966                  rowsleft    -= L1_rowsleft;
967                  L1_rowsleft  = 0;
968                  break;
969         case 7:  MatMatMultSymbolic_RowMergeMacro(7);
970                  inputcol    += L1_rowsleft;
971                  rowsleft    -= L1_rowsleft;
972                  L1_rowsleft  = 0;
973                  break;
974         default: MatMatMultSymbolic_RowMergeMacro(8);
975                  inputcol    += 8;
976                  rowsleft    -= 8;
977                  L1_rowsleft -= 8;
978                  break;
979         }
980         inputcol_L1           = inputcol;
981         L1_nnz               += outputi_nnz;
982         worki_L1[++L1_nrows]  = L1_nnz;
983       }
984 
985       /********************** L E V E L  2 ************************/
986       /* Merge from L1 work array to either C or to L2 work array */
987       if (anzi > 8) {
988         inputi      = worki_L1;
989         inputj      = workj_L1;
990         inputcol    = workcol;
991         outputi_nnz = 0;
992 
993         if (anzi <= 64) outputj = cj + ci_nnz;        /* Merge from L1 work array to C */
994         else            outputj = workj_L2 + L2_nnz;  /* Merge from L1 work array to L2 work array */
995 
996         switch (L1_nrows) {
997         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
998                  brow_end[0] = inputj + inputi[inputcol[0]+1];
999                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1000                  break;
1001         case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
1002         case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
1003         case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1004         case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1005         case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1006         case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1007         case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1008         default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
1009         }
1010         L2_nnz               += outputi_nnz;
1011         worki_L2[++L2_nrows]  = L2_nnz;
1012 
1013         /************************ L E V E L  3 **********************/
1014         /* Merge from L2 work array to either C or to L2 work array */
1015         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1016           inputi      = worki_L2;
1017           inputj      = workj_L2;
1018           inputcol    = workcol;
1019           outputi_nnz = 0;
1020           if (rowsleft) outputj = workj_L3;
1021           else          outputj = cj + ci_nnz;
1022           switch (L2_nrows) {
1023           case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
1024                    brow_end[0] = inputj + inputi[inputcol[0]+1];
1025                    for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1026                    break;
1027           case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
1028           case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
1029           case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1030           case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1031           case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1032           case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1033           case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1034           default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1035           }
1036           L2_nrows    = 1;
1037           L2_nnz      = outputi_nnz;
1038           worki_L2[1] = outputi_nnz;
1039           /* Copy to workj_L2 */
1040           if (rowsleft) {
1041             for (k=0; k<outputi_nnz; ++k)  workj_L2[k] = outputj[k];
1042           }
1043         }
1044       }
1045     }  /* while (rowsleft) */
1046 #undef MatMatMultSymbolic_RowMergeMacro
1047 
1048     /* terminate current row */
1049     ci_nnz += outputi_nnz;
1050     ci[i+1] = ci_nnz;
1051   }
1052 
1053   /* Step 3: Create the new symbolic matrix */
1054   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
1055   PetscCall(MatSetBlockSizesFromMats(C,A,B));
1056 
1057   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1058   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1059   c          = (Mat_SeqAIJ*)(C->data);
1060   c->free_a  = PETSC_TRUE;
1061   c->free_ij = PETSC_TRUE;
1062   c->nonew   = 0;
1063 
1064   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1065 
1066   /* set MatInfo */
1067   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1068   if (afill < 1.0) afill = 1.0;
1069   C->info.mallocs           = ndouble;
1070   C->info.fill_ratio_given  = fill;
1071   C->info.fill_ratio_needed = afill;
1072 
1073 #if defined(PETSC_USE_INFO)
1074   if (ci[am]) {
1075     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
1076     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
1077   } else {
1078     PetscCall(PetscInfo(C,"Empty matrix product\n"));
1079   }
1080 #endif
1081 
1082   /* Step 4: Free temporary work areas */
1083   PetscCall(PetscFree(workj_L1));
1084   PetscCall(PetscFree(workj_L2));
1085   PetscCall(PetscFree(workj_L3));
1086   PetscFunctionReturn(0);
1087 }
1088 
1089 /* concatenate unique entries and then sort */
1090 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C)
1091 {
1092   Mat_SeqAIJ     *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1093   const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
1094   PetscInt       *ci,*cj,bcol;
1095   PetscInt       am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1096   PetscReal      afill;
1097   PetscInt       i,j,ndouble = 0;
1098   PetscSegBuffer seg,segrow;
1099   char           *seen;
1100 
1101   PetscFunctionBegin;
1102   PetscCall(PetscMalloc1(am+1,&ci));
1103   ci[0] = 0;
1104 
1105   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1106   PetscCall(PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg));
1107   PetscCall(PetscSegBufferCreate(sizeof(PetscInt),100,&segrow));
1108   PetscCall(PetscCalloc1(bn,&seen));
1109 
1110   /* Determine ci and cj */
1111   for (i=0; i<am; i++) {
1112     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1113     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1114     PetscInt packlen = 0,*PETSC_RESTRICT crow;
1115 
1116     /* Pack segrow */
1117     for (j=0; j<anzi; j++) {
1118       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
1119       for (k=bjstart; k<bjend; k++) {
1120         bcol = bj[k];
1121         if (!seen[bcol]) { /* new entry */
1122           PetscInt *PETSC_RESTRICT slot;
1123           PetscCall(PetscSegBufferGetInts(segrow,1,&slot));
1124           *slot = bcol;
1125           seen[bcol] = 1;
1126           packlen++;
1127         }
1128       }
1129     }
1130 
1131     /* Check i-th diagonal entry */
1132     if (C->force_diagonals && !seen[i]) {
1133       PetscInt *PETSC_RESTRICT slot;
1134       PetscCall(PetscSegBufferGetInts(segrow,1,&slot));
1135       *slot   = i;
1136       seen[i] = 1;
1137       packlen++;
1138     }
1139 
1140     PetscCall(PetscSegBufferGetInts(seg,packlen,&crow));
1141     PetscCall(PetscSegBufferExtractTo(segrow,crow));
1142     PetscCall(PetscSortInt(packlen,crow));
1143     ci[i+1] = ci[i] + packlen;
1144     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
1145   }
1146   PetscCall(PetscSegBufferDestroy(&segrow));
1147   PetscCall(PetscFree(seen));
1148 
1149   /* Column indices are in the segmented buffer */
1150   PetscCall(PetscSegBufferExtractAlloc(seg,&cj));
1151   PetscCall(PetscSegBufferDestroy(&seg));
1152 
1153   /* put together the new symbolic matrix */
1154   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
1155   PetscCall(MatSetBlockSizesFromMats(C,A,B));
1156 
1157   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1158   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1159   c          = (Mat_SeqAIJ*)(C->data);
1160   c->free_a  = PETSC_TRUE;
1161   c->free_ij = PETSC_TRUE;
1162   c->nonew   = 0;
1163 
1164   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1165 
1166   /* set MatInfo */
1167   afill = (PetscReal)ci[am]/PetscMax(ai[am]+bi[bm],1) + 1.e-5;
1168   if (afill < 1.0) afill = 1.0;
1169   C->info.mallocs           = ndouble;
1170   C->info.fill_ratio_given  = fill;
1171   C->info.fill_ratio_needed = afill;
1172 
1173 #if defined(PETSC_USE_INFO)
1174   if (ci[am]) {
1175     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
1176     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
1177   } else {
1178     PetscCall(PetscInfo(C,"Empty matrix product\n"));
1179   }
1180 #endif
1181   PetscFunctionReturn(0);
1182 }
1183 
1184 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
1185 {
1186   Mat_MatMatTransMult *abt=(Mat_MatMatTransMult *)data;
1187 
1188   PetscFunctionBegin;
1189   PetscCall(MatTransposeColoringDestroy(&abt->matcoloring));
1190   PetscCall(MatDestroy(&abt->Bt_den));
1191   PetscCall(MatDestroy(&abt->ABt_den));
1192   PetscCall(PetscFree(abt));
1193   PetscFunctionReturn(0);
1194 }
1195 
1196 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1197 {
1198   Mat                 Bt;
1199   Mat_MatMatTransMult *abt;
1200   Mat_Product         *product = C->product;
1201   char                *alg;
1202 
1203   PetscFunctionBegin;
1204   PetscCheck(product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1205   PetscCheck(!product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
1206 
1207   /* create symbolic Bt */
1208   PetscCall(MatTransposeSymbolic(B,&Bt));
1209   PetscCall(MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs)));
1210   PetscCall(MatSetType(Bt,((PetscObject)A)->type_name));
1211 
1212   /* get symbolic C=A*Bt */
1213   PetscCall(PetscStrallocpy(product->alg,&alg));
1214   PetscCall(MatProductSetAlgorithm(C,"sorted")); /* set algorithm for C = A*Bt */
1215   PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C));
1216   PetscCall(MatProductSetAlgorithm(C,alg)); /* resume original algorithm for ABt product */
1217   PetscCall(PetscFree(alg));
1218 
1219   /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
1220   PetscCall(PetscNew(&abt));
1221 
1222   product->data    = abt;
1223   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
1224 
1225   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
1226 
1227   abt->usecoloring = PETSC_FALSE;
1228   PetscCall(PetscStrcmp(product->alg,"color",&abt->usecoloring));
1229   if (abt->usecoloring) {
1230     /* Create MatTransposeColoring from symbolic C=A*B^T */
1231     MatTransposeColoring matcoloring;
1232     MatColoring          coloring;
1233     ISColoring           iscoloring;
1234     Mat                  Bt_dense,C_dense;
1235 
1236     /* inode causes memory problem */
1237     PetscCall(MatSetOption(C,MAT_USE_INODES,PETSC_FALSE));
1238 
1239     PetscCall(MatColoringCreate(C,&coloring));
1240     PetscCall(MatColoringSetDistance(coloring,2));
1241     PetscCall(MatColoringSetType(coloring,MATCOLORINGSL));
1242     PetscCall(MatColoringSetFromOptions(coloring));
1243     PetscCall(MatColoringApply(coloring,&iscoloring));
1244     PetscCall(MatColoringDestroy(&coloring));
1245     PetscCall(MatTransposeColoringCreate(C,iscoloring,&matcoloring));
1246 
1247     abt->matcoloring = matcoloring;
1248 
1249     PetscCall(ISColoringDestroy(&iscoloring));
1250 
1251     /* Create Bt_dense and C_dense = A*Bt_dense */
1252     PetscCall(MatCreate(PETSC_COMM_SELF,&Bt_dense));
1253     PetscCall(MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors));
1254     PetscCall(MatSetType(Bt_dense,MATSEQDENSE));
1255     PetscCall(MatSeqDenseSetPreallocation(Bt_dense,NULL));
1256 
1257     Bt_dense->assembled = PETSC_TRUE;
1258     abt->Bt_den         = Bt_dense;
1259 
1260     PetscCall(MatCreate(PETSC_COMM_SELF,&C_dense));
1261     PetscCall(MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors));
1262     PetscCall(MatSetType(C_dense,MATSEQDENSE));
1263     PetscCall(MatSeqDenseSetPreallocation(C_dense,NULL));
1264 
1265     Bt_dense->assembled = PETSC_TRUE;
1266     abt->ABt_den  = C_dense;
1267 
1268 #if defined(PETSC_USE_INFO)
1269     {
1270       Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
1271       PetscCall(PetscInfo(C,"Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(double)(((PetscReal)(c->nz))/((PetscReal)(A->rmap->n*matcoloring->ncolors)))));
1272     }
1273 #endif
1274   }
1275   /* clean up */
1276   PetscCall(MatDestroy(&Bt));
1277   PetscFunctionReturn(0);
1278 }
1279 
1280 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1281 {
1282   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1283   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
1284   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
1285   PetscLogDouble      flops=0.0;
1286   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
1287   Mat_MatMatTransMult *abt;
1288   Mat_Product         *product = C->product;
1289 
1290   PetscFunctionBegin;
1291   PetscCheck(product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1292   abt = (Mat_MatMatTransMult *)product->data;
1293   PetscCheck(abt,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1294   /* clear old values in C */
1295   if (!c->a) {
1296     PetscCall(PetscCalloc1(ci[cm]+1,&ca));
1297     c->a      = ca;
1298     c->free_a = PETSC_TRUE;
1299   } else {
1300     ca =  c->a;
1301     PetscCall(PetscArrayzero(ca,ci[cm]+1));
1302   }
1303 
1304   if (abt->usecoloring) {
1305     MatTransposeColoring matcoloring = abt->matcoloring;
1306     Mat                  Bt_dense,C_dense = abt->ABt_den;
1307 
1308     /* Get Bt_dense by Apply MatTransposeColoring to B */
1309     Bt_dense = abt->Bt_den;
1310     PetscCall(MatTransColoringApplySpToDen(matcoloring,B,Bt_dense));
1311 
1312     /* C_dense = A*Bt_dense */
1313     PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense));
1314 
1315     /* Recover C from C_dense */
1316     PetscCall(MatTransColoringApplyDenToSp(matcoloring,C_dense,C));
1317     PetscFunctionReturn(0);
1318   }
1319 
1320   for (i=0; i<cm; i++) {
1321     anzi = ai[i+1] - ai[i];
1322     acol = aj + ai[i];
1323     aval = aa + ai[i];
1324     cnzi = ci[i+1] - ci[i];
1325     ccol = cj + ci[i];
1326     cval = ca + ci[i];
1327     for (j=0; j<cnzi; j++) {
1328       brow = ccol[j];
1329       bnzj = bi[brow+1] - bi[brow];
1330       bcol = bj + bi[brow];
1331       bval = ba + bi[brow];
1332 
1333       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
1334       nexta = 0; nextb = 0;
1335       while (nexta<anzi && nextb<bnzj) {
1336         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
1337         if (nexta == anzi) break;
1338         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
1339         if (nextb == bnzj) break;
1340         if (acol[nexta] == bcol[nextb]) {
1341           cval[j] += aval[nexta]*bval[nextb];
1342           nexta++; nextb++;
1343           flops += 2;
1344         }
1345       }
1346     }
1347   }
1348   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
1349   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
1350   PetscCall(PetscLogFlops(flops));
1351   PetscFunctionReturn(0);
1352 }
1353 
1354 PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
1355 {
1356   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult*)data;
1357 
1358   PetscFunctionBegin;
1359   PetscCall(MatDestroy(&atb->At));
1360   if (atb->destroy) PetscCall((*atb->destroy)(atb->data));
1361   PetscCall(PetscFree(atb));
1362   PetscFunctionReturn(0);
1363 }
1364 
1365 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1366 {
1367   Mat            At = NULL;
1368   Mat_Product    *product = C->product;
1369   PetscBool      flg,def,square;
1370 
1371   PetscFunctionBegin;
1372   MatCheckProduct(C,4);
1373   square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE);
1374   /* outerproduct */
1375   PetscCall(PetscStrcmp(product->alg,"outerproduct",&flg));
1376   if (flg) {
1377     /* create symbolic At */
1378     if (!square) {
1379       PetscCall(MatTransposeSymbolic(A,&At));
1380       PetscCall(MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs)));
1381       PetscCall(MatSetType(At,((PetscObject)A)->type_name));
1382     }
1383     /* get symbolic C=At*B */
1384     PetscCall(MatProductSetAlgorithm(C,"sorted"));
1385     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C));
1386 
1387     /* clean up */
1388     if (!square) {
1389       PetscCall(MatDestroy(&At));
1390     }
1391 
1392     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
1393     PetscCall(MatProductSetAlgorithm(C,"outerproduct"));
1394     PetscFunctionReturn(0);
1395   }
1396 
1397   /* matmatmult */
1398   PetscCall(PetscStrcmp(product->alg,"default",&def));
1399   PetscCall(PetscStrcmp(product->alg,"at*b",&flg));
1400   if (flg || def) {
1401     Mat_MatTransMatMult *atb;
1402 
1403     PetscCheck(!product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
1404     PetscCall(PetscNew(&atb));
1405     if (!square) {
1406       PetscCall(MatTranspose(A,MAT_INITIAL_MATRIX,&At));
1407     }
1408     PetscCall(MatProductSetAlgorithm(C,"sorted"));
1409     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C));
1410     PetscCall(MatProductSetAlgorithm(C,"at*b"));
1411     product->data    = atb;
1412     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
1413     atb->At          = At;
1414 
1415     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
1416     PetscFunctionReturn(0);
1417   }
1418 
1419   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1420 }
1421 
1422 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1423 {
1424   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1425   PetscInt       am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1426   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1427   PetscLogDouble flops=0.0;
1428   MatScalar      *aa=a->a,*ba,*ca,*caj;
1429 
1430   PetscFunctionBegin;
1431   if (!c->a) {
1432     PetscCall(PetscCalloc1(ci[cm]+1,&ca));
1433 
1434     c->a      = ca;
1435     c->free_a = PETSC_TRUE;
1436   } else {
1437     ca   = c->a;
1438     PetscCall(PetscArrayzero(ca,ci[cm]));
1439   }
1440 
1441   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1442   for (i=0; i<am; i++) {
1443     bj   = b->j + bi[i];
1444     ba   = b->a + bi[i];
1445     bnzi = bi[i+1] - bi[i];
1446     anzi = ai[i+1] - ai[i];
1447     for (j=0; j<anzi; j++) {
1448       nextb = 0;
1449       crow  = *aj++;
1450       cjj   = cj + ci[crow];
1451       caj   = ca + ci[crow];
1452       /* perform sparse axpy operation.  Note cjj includes bj. */
1453       for (k=0; nextb<bnzi; k++) {
1454         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
1455           caj[k] += (*aa)*(*(ba+nextb));
1456           nextb++;
1457         }
1458       }
1459       flops += 2*bnzi;
1460       aa++;
1461     }
1462   }
1463 
1464   /* Assemble the final matrix and clean up */
1465   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
1466   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
1467   PetscCall(PetscLogFlops(flops));
1468   PetscFunctionReturn(0);
1469 }
1470 
1471 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
1472 {
1473   PetscFunctionBegin;
1474   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C));
1475   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1476   PetscFunctionReturn(0);
1477 }
1478 
1479 PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C,const PetscBool add)
1480 {
1481   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
1482   PetscScalar       *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4;
1483   const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av;
1484   const PetscInt    *aj;
1485   PetscInt          cm=C->rmap->n,cn=B->cmap->n,bm,am=A->rmap->n;
1486   PetscInt          clda;
1487   PetscInt          am4,bm4,col,i,j,n;
1488 
1489   PetscFunctionBegin;
1490   if (!cm || !cn) PetscFunctionReturn(0);
1491   PetscCall(MatSeqAIJGetArrayRead(A,&av));
1492   if (add) {
1493     PetscCall(MatDenseGetArray(C,&c));
1494   } else {
1495     PetscCall(MatDenseGetArrayWrite(C,&c));
1496   }
1497   PetscCall(MatDenseGetArrayRead(B,&b));
1498   PetscCall(MatDenseGetLDA(B,&bm));
1499   PetscCall(MatDenseGetLDA(C,&clda));
1500   am4 = 4*clda;
1501   bm4 = 4*bm;
1502   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1503   c1 = c; c2 = c1 + clda; c3 = c2 + clda; c4 = c3 + clda;
1504   for (col=0; col<(cn/4)*4; col += 4) {  /* over columns of C */
1505     for (i=0; i<am; i++) {        /* over rows of A in those columns */
1506       r1 = r2 = r3 = r4 = 0.0;
1507       n  = a->i[i+1] - a->i[i];
1508       aj = a->j + a->i[i];
1509       aa = av + a->i[i];
1510       for (j=0; j<n; j++) {
1511         const PetscScalar aatmp = aa[j];
1512         const PetscInt    ajtmp = aj[j];
1513         r1 += aatmp*b1[ajtmp];
1514         r2 += aatmp*b2[ajtmp];
1515         r3 += aatmp*b3[ajtmp];
1516         r4 += aatmp*b4[ajtmp];
1517       }
1518       if (add) {
1519         c1[i] += r1;
1520         c2[i] += r2;
1521         c3[i] += r3;
1522         c4[i] += r4;
1523       } else {
1524         c1[i] = r1;
1525         c2[i] = r2;
1526         c3[i] = r3;
1527         c4[i] = r4;
1528       }
1529     }
1530     b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1531     c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1532   }
1533   /* process remaining columns */
1534   if (col != cn) {
1535     PetscInt rc = cn-col;
1536 
1537     if (rc == 1) {
1538       for (i=0; i<am; i++) {
1539         r1 = 0.0;
1540         n  = a->i[i+1] - a->i[i];
1541         aj = a->j + a->i[i];
1542         aa = av + a->i[i];
1543         for (j=0; j<n; j++) r1 += aa[j]*b1[aj[j]];
1544         if (add) c1[i] += r1;
1545         else c1[i] = r1;
1546       }
1547     } else if (rc == 2) {
1548       for (i=0; i<am; i++) {
1549         r1 = r2 = 0.0;
1550         n  = a->i[i+1] - a->i[i];
1551         aj = a->j + a->i[i];
1552         aa = av + a->i[i];
1553         for (j=0; j<n; j++) {
1554           const PetscScalar aatmp = aa[j];
1555           const PetscInt    ajtmp = aj[j];
1556           r1 += aatmp*b1[ajtmp];
1557           r2 += aatmp*b2[ajtmp];
1558         }
1559         if (add) {
1560           c1[i] += r1;
1561           c2[i] += r2;
1562         } else {
1563           c1[i] = r1;
1564           c2[i] = r2;
1565         }
1566       }
1567     } else {
1568       for (i=0; i<am; i++) {
1569         r1 = r2 = r3 = 0.0;
1570         n  = a->i[i+1] - a->i[i];
1571         aj = a->j + a->i[i];
1572         aa = av + a->i[i];
1573         for (j=0; j<n; j++) {
1574           const PetscScalar aatmp = aa[j];
1575           const PetscInt    ajtmp = aj[j];
1576           r1 += aatmp*b1[ajtmp];
1577           r2 += aatmp*b2[ajtmp];
1578           r3 += aatmp*b3[ajtmp];
1579         }
1580         if (add) {
1581           c1[i] += r1;
1582           c2[i] += r2;
1583           c3[i] += r3;
1584         } else {
1585           c1[i] = r1;
1586           c2[i] = r2;
1587           c3[i] = r3;
1588         }
1589       }
1590     }
1591   }
1592   PetscCall(PetscLogFlops(cn*(2.0*a->nz)));
1593   if (add) {
1594     PetscCall(MatDenseRestoreArray(C,&c));
1595   } else {
1596     PetscCall(MatDenseRestoreArrayWrite(C,&c));
1597   }
1598   PetscCall(MatDenseRestoreArrayRead(B,&b));
1599   PetscCall(MatSeqAIJRestoreArrayRead(A,&av));
1600   PetscFunctionReturn(0);
1601 }
1602 
1603 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1604 {
1605   PetscFunctionBegin;
1606   PetscCheck(B->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n);
1607   PetscCheck(A->rmap->n == C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n);
1608   PetscCheck(B->cmap->n == C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n);
1609 
1610   PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C,PETSC_FALSE));
1611   PetscFunctionReturn(0);
1612 }
1613 
1614 /* ------------------------------------------------------- */
1615 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1616 {
1617   PetscFunctionBegin;
1618   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
1619   C->ops->productsymbolic = MatProductSymbolic_AB;
1620   PetscFunctionReturn(0);
1621 }
1622 
1623 PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat,Mat,PetscReal,Mat);
1624 
1625 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1626 {
1627   PetscFunctionBegin;
1628   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1629   C->ops->productsymbolic          = MatProductSymbolic_AtB;
1630   PetscFunctionReturn(0);
1631 }
1632 
1633 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
1634 {
1635   PetscFunctionBegin;
1636   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1637   C->ops->productsymbolic          = MatProductSymbolic_ABt;
1638   PetscFunctionReturn(0);
1639 }
1640 
1641 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1642 {
1643   Mat_Product    *product = C->product;
1644 
1645   PetscFunctionBegin;
1646   switch (product->type) {
1647   case MATPRODUCT_AB:
1648     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C));
1649     break;
1650   case MATPRODUCT_AtB:
1651     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C));
1652     break;
1653   case MATPRODUCT_ABt:
1654     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C));
1655     break;
1656   default:
1657     break;
1658   }
1659   PetscFunctionReturn(0);
1660 }
1661 /* ------------------------------------------------------- */
1662 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1663 {
1664   Mat_Product    *product = C->product;
1665   Mat            A = product->A;
1666   PetscBool      baij;
1667 
1668   PetscFunctionBegin;
1669   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij));
1670   if (!baij) { /* A is seqsbaij */
1671     PetscBool sbaij;
1672     PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij));
1673     PetscCheck(sbaij,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format");
1674 
1675     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
1676   } else { /* A is seqbaij */
1677     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
1678   }
1679 
1680   C->ops->productsymbolic = MatProductSymbolic_AB;
1681   PetscFunctionReturn(0);
1682 }
1683 
1684 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1685 {
1686   Mat_Product    *product = C->product;
1687 
1688   PetscFunctionBegin;
1689   MatCheckProduct(C,1);
1690   PetscCheck(product->A,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
1691   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C));
1692   PetscFunctionReturn(0);
1693 }
1694 
1695 /* ------------------------------------------------------- */
1696 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1697 {
1698   PetscFunctionBegin;
1699   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
1700   C->ops->productsymbolic = MatProductSymbolic_AB;
1701   PetscFunctionReturn(0);
1702 }
1703 
1704 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1705 {
1706   Mat_Product    *product = C->product;
1707 
1708   PetscFunctionBegin;
1709   if (product->type == MATPRODUCT_AB) {
1710     PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C));
1711   }
1712   PetscFunctionReturn(0);
1713 }
1714 /* ------------------------------------------------------- */
1715 
1716 PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1717 {
1718   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
1719   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
1720   PetscInt       *bi      = b->i,*bj=b->j;
1721   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1722   MatScalar      *btval,*btval_den,*ba=b->a;
1723   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1724 
1725   PetscFunctionBegin;
1726   btval_den=btdense->v;
1727   PetscCall(PetscArrayzero(btval_den,m*n));
1728   for (k=0; k<ncolors; k++) {
1729     ncolumns = coloring->ncolumns[k];
1730     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1731       col   = *(columns + colorforcol[k] + l);
1732       btcol = bj + bi[col];
1733       btval = ba + bi[col];
1734       anz   = bi[col+1] - bi[col];
1735       for (j=0; j<anz; j++) {
1736         brow            = btcol[j];
1737         btval_den[brow] = btval[j];
1738       }
1739     }
1740     btval_den += m;
1741   }
1742   PetscFunctionReturn(0);
1743 }
1744 
1745 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1746 {
1747   Mat_SeqAIJ        *csp = (Mat_SeqAIJ*)Csp->data;
1748   const PetscScalar *ca_den,*ca_den_ptr;
1749   PetscScalar       *ca=csp->a;
1750   PetscInt          k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1751   PetscInt          brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1752   PetscInt          nrows,*row,*idx;
1753   PetscInt          *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
1754 
1755   PetscFunctionBegin;
1756   PetscCall(MatDenseGetArrayRead(Cden,&ca_den));
1757 
1758   if (brows > 0) {
1759     PetscInt *lstart,row_end,row_start;
1760     lstart = matcoloring->lstart;
1761     PetscCall(PetscArrayzero(lstart,ncolors));
1762 
1763     row_end = brows;
1764     if (row_end > m) row_end = m;
1765     for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1766       ca_den_ptr = ca_den;
1767       for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1768         nrows = matcoloring->nrows[k];
1769         row   = rows  + colorforrow[k];
1770         idx   = den2sp + colorforrow[k];
1771         for (l=lstart[k]; l<nrows; l++) {
1772           if (row[l] >= row_end) {
1773             lstart[k] = l;
1774             break;
1775           } else {
1776             ca[idx[l]] = ca_den_ptr[row[l]];
1777           }
1778         }
1779         ca_den_ptr += m;
1780       }
1781       row_end += brows;
1782       if (row_end > m) row_end = m;
1783     }
1784   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1785     ca_den_ptr = ca_den;
1786     for (k=0; k<ncolors; k++) {
1787       nrows = matcoloring->nrows[k];
1788       row   = rows  + colorforrow[k];
1789       idx   = den2sp + colorforrow[k];
1790       for (l=0; l<nrows; l++) {
1791         ca[idx[l]] = ca_den_ptr[row[l]];
1792       }
1793       ca_den_ptr += m;
1794     }
1795   }
1796 
1797   PetscCall(MatDenseRestoreArrayRead(Cden,&ca_den));
1798 #if defined(PETSC_USE_INFO)
1799   if (matcoloring->brows > 0) {
1800     PetscCall(PetscInfo(Csp,"Loop over %" PetscInt_FMT " row blocks for den2sp\n",brows));
1801   } else {
1802     PetscCall(PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n"));
1803   }
1804 #endif
1805   PetscFunctionReturn(0);
1806 }
1807 
1808 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1809 {
1810   PetscInt       i,n,nrows,Nbs,j,k,m,ncols,col,cm;
1811   const PetscInt *is,*ci,*cj,*row_idx;
1812   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1813   IS             *isa;
1814   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1815   PetscInt       *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1816   PetscInt       *colorforcol,*columns,*columns_i,brows;
1817   PetscBool      flg;
1818 
1819   PetscFunctionBegin;
1820   PetscCall(ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa));
1821 
1822   /* bs >1 is not being tested yet! */
1823   Nbs       = mat->cmap->N/bs;
1824   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1825   c->N      = Nbs;
1826   c->m      = c->M;
1827   c->rstart = 0;
1828   c->brows  = 100;
1829 
1830   c->ncolors = nis;
1831   PetscCall(PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow));
1832   PetscCall(PetscMalloc1(csp->nz+1,&rows));
1833   PetscCall(PetscMalloc1(csp->nz+1,&den2sp));
1834 
1835   brows = c->brows;
1836   PetscCall(PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg));
1837   if (flg) c->brows = brows;
1838   if (brows > 0) {
1839     PetscCall(PetscMalloc1(nis+1,&c->lstart));
1840   }
1841 
1842   colorforrow[0] = 0;
1843   rows_i         = rows;
1844   den2sp_i       = den2sp;
1845 
1846   PetscCall(PetscMalloc1(nis+1,&colorforcol));
1847   PetscCall(PetscMalloc1(Nbs+1,&columns));
1848 
1849   colorforcol[0] = 0;
1850   columns_i      = columns;
1851 
1852   /* get column-wise storage of mat */
1853   PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL));
1854 
1855   cm   = c->m;
1856   PetscCall(PetscMalloc1(cm+1,&rowhit));
1857   PetscCall(PetscMalloc1(cm+1,&idxhit));
1858   for (i=0; i<nis; i++) { /* loop over color */
1859     PetscCall(ISGetLocalSize(isa[i],&n));
1860     PetscCall(ISGetIndices(isa[i],&is));
1861 
1862     c->ncolumns[i] = n;
1863     if (n) PetscCall(PetscArraycpy(columns_i,is,n));
1864     colorforcol[i+1] = colorforcol[i] + n;
1865     columns_i       += n;
1866 
1867     /* fast, crude version requires O(N*N) work */
1868     PetscCall(PetscArrayzero(rowhit,cm));
1869 
1870     for (j=0; j<n; j++) { /* loop over columns*/
1871       col     = is[j];
1872       row_idx = cj + ci[col];
1873       m       = ci[col+1] - ci[col];
1874       for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1875         idxhit[*row_idx]   = spidx[ci[col] + k];
1876         rowhit[*row_idx++] = col + 1;
1877       }
1878     }
1879     /* count the number of hits */
1880     nrows = 0;
1881     for (j=0; j<cm; j++) {
1882       if (rowhit[j]) nrows++;
1883     }
1884     c->nrows[i]      = nrows;
1885     colorforrow[i+1] = colorforrow[i] + nrows;
1886 
1887     nrows = 0;
1888     for (j=0; j<cm; j++) { /* loop over rows */
1889       if (rowhit[j]) {
1890         rows_i[nrows]   = j;
1891         den2sp_i[nrows] = idxhit[j];
1892         nrows++;
1893       }
1894     }
1895     den2sp_i += nrows;
1896 
1897     PetscCall(ISRestoreIndices(isa[i],&is));
1898     rows_i += nrows;
1899   }
1900   PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL));
1901   PetscCall(PetscFree(rowhit));
1902   PetscCall(ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa));
1903   PetscCheck(csp->nz == colorforrow[nis],PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT,csp->nz,colorforrow[nis]);
1904 
1905   c->colorforrow = colorforrow;
1906   c->rows        = rows;
1907   c->den2sp      = den2sp;
1908   c->colorforcol = colorforcol;
1909   c->columns     = columns;
1910 
1911   PetscCall(PetscFree(idxhit));
1912   PetscFunctionReturn(0);
1913 }
1914 
1915 /* --------------------------------------------------------------- */
1916 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1917 {
1918   Mat_Product    *product = C->product;
1919   Mat            A=product->A,B=product->B;
1920 
1921   PetscFunctionBegin;
1922   if (C->ops->mattransposemultnumeric) {
1923     /* Alg: "outerproduct" */
1924     PetscCall((*C->ops->mattransposemultnumeric)(A,B,C));
1925   } else {
1926     /* Alg: "matmatmult" -- C = At*B */
1927     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;
1928 
1929     PetscCheck(atb,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1930     if (atb->At) { /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ() */
1931       PetscCall(MatTranspose(A,MAT_REUSE_MATRIX,&atb->At));
1932     }
1933     PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A,B,C));
1934   }
1935   PetscFunctionReturn(0);
1936 }
1937 
1938 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1939 {
1940   Mat_Product    *product = C->product;
1941   Mat            A=product->A,B=product->B;
1942   PetscReal      fill=product->fill;
1943 
1944   PetscFunctionBegin;
1945   PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C));
1946 
1947   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
1948   PetscFunctionReturn(0);
1949 }
1950 
1951 /* --------------------------------------------------------------- */
1952 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1953 {
1954   Mat_Product    *product = C->product;
1955   PetscInt       alg = 0; /* default algorithm */
1956   PetscBool      flg = PETSC_FALSE;
1957 #if !defined(PETSC_HAVE_HYPRE)
1958   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
1959   PetscInt       nalg = 7;
1960 #else
1961   const char     *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"};
1962   PetscInt       nalg = 8;
1963 #endif
1964 
1965   PetscFunctionBegin;
1966   /* Set default algorithm */
1967   PetscCall(PetscStrcmp(C->product->alg,"default",&flg));
1968   if (flg) {
1969     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
1970   }
1971 
1972   /* Get runtime option */
1973   if (product->api_user) {
1974     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");
1975     PetscCall(PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg));
1976     PetscOptionsEnd();
1977   } else {
1978     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");
1979     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg));
1980     PetscOptionsEnd();
1981   }
1982   if (flg) {
1983     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
1984   }
1985 
1986   C->ops->productsymbolic = MatProductSymbolic_AB;
1987   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
1988   PetscFunctionReturn(0);
1989 }
1990 
1991 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
1992 {
1993   Mat_Product    *product = C->product;
1994   PetscInt       alg = 0; /* default algorithm */
1995   PetscBool      flg = PETSC_FALSE;
1996   const char     *algTypes[3] = {"default","at*b","outerproduct"};
1997   PetscInt       nalg = 3;
1998 
1999   PetscFunctionBegin;
2000   /* Get runtime option */
2001   if (product->api_user) {
2002     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");
2003     PetscCall(PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg));
2004     PetscOptionsEnd();
2005   } else {
2006     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");
2007     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg));
2008     PetscOptionsEnd();
2009   }
2010   if (flg) {
2011     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2012   }
2013 
2014   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
2015   PetscFunctionReturn(0);
2016 }
2017 
2018 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
2019 {
2020   Mat_Product    *product = C->product;
2021   PetscInt       alg = 0; /* default algorithm */
2022   PetscBool      flg = PETSC_FALSE;
2023   const char     *algTypes[2] = {"default","color"};
2024   PetscInt       nalg = 2;
2025 
2026   PetscFunctionBegin;
2027   /* Set default algorithm */
2028   PetscCall(PetscStrcmp(C->product->alg,"default",&flg));
2029   if (!flg) {
2030     alg = 1;
2031     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2032   }
2033 
2034   /* Get runtime option */
2035   if (product->api_user) {
2036     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");
2037     PetscCall(PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg));
2038     PetscOptionsEnd();
2039   } else {
2040     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");
2041     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg));
2042     PetscOptionsEnd();
2043   }
2044   if (flg) {
2045     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2046   }
2047 
2048   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
2049   C->ops->productsymbolic          = MatProductSymbolic_ABt;
2050   PetscFunctionReturn(0);
2051 }
2052 
2053 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
2054 {
2055   Mat_Product    *product = C->product;
2056   PetscBool      flg = PETSC_FALSE;
2057   PetscInt       alg = 0; /* default algorithm -- alg=1 should be default!!! */
2058 #if !defined(PETSC_HAVE_HYPRE)
2059   const char     *algTypes[2] = {"scalable","rap"};
2060   PetscInt       nalg = 2;
2061 #else
2062   const char     *algTypes[3] = {"scalable","rap","hypre"};
2063   PetscInt       nalg = 3;
2064 #endif
2065 
2066   PetscFunctionBegin;
2067   /* Set default algorithm */
2068   PetscCall(PetscStrcmp(product->alg,"default",&flg));
2069   if (flg) {
2070     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2071   }
2072 
2073   /* Get runtime option */
2074   if (product->api_user) {
2075     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");
2076     PetscCall(PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg));
2077     PetscOptionsEnd();
2078   } else {
2079     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");
2080     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg));
2081     PetscOptionsEnd();
2082   }
2083   if (flg) {
2084     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2085   }
2086 
2087   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
2088   PetscFunctionReturn(0);
2089 }
2090 
2091 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2092 {
2093   Mat_Product    *product = C->product;
2094   PetscBool      flg = PETSC_FALSE;
2095   PetscInt       alg = 0; /* default algorithm */
2096   const char     *algTypes[3] = {"r*a*rt","r*art","coloring_rart"};
2097   PetscInt        nalg = 3;
2098 
2099   PetscFunctionBegin;
2100   /* Set default algorithm */
2101   PetscCall(PetscStrcmp(product->alg,"default",&flg));
2102   if (flg) {
2103     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2104   }
2105 
2106   /* Get runtime option */
2107   if (product->api_user) {
2108     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");
2109     PetscCall(PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg));
2110     PetscOptionsEnd();
2111   } else {
2112     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");
2113     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg));
2114     PetscOptionsEnd();
2115   }
2116   if (flg) {
2117     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2118   }
2119 
2120   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
2121   PetscFunctionReturn(0);
2122 }
2123 
2124 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
2125 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2126 {
2127   Mat_Product    *product = C->product;
2128   PetscInt       alg = 0; /* default algorithm */
2129   PetscBool      flg = PETSC_FALSE;
2130   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
2131   PetscInt       nalg = 7;
2132 
2133   PetscFunctionBegin;
2134   /* Set default algorithm */
2135   PetscCall(PetscStrcmp(product->alg,"default",&flg));
2136   if (flg) {
2137     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2138   }
2139 
2140   /* Get runtime option */
2141   if (product->api_user) {
2142     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");
2143     PetscCall(PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg));
2144     PetscOptionsEnd();
2145   } else {
2146     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");
2147     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg));
2148     PetscOptionsEnd();
2149   }
2150   if (flg) {
2151     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2152   }
2153 
2154   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
2155   C->ops->productsymbolic    = MatProductSymbolic_ABC;
2156   PetscFunctionReturn(0);
2157 }
2158 
2159 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2160 {
2161   Mat_Product    *product = C->product;
2162 
2163   PetscFunctionBegin;
2164   switch (product->type) {
2165   case MATPRODUCT_AB:
2166     PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C));
2167     break;
2168   case MATPRODUCT_AtB:
2169     PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C));
2170     break;
2171   case MATPRODUCT_ABt:
2172     PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C));
2173     break;
2174   case MATPRODUCT_PtAP:
2175     PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C));
2176     break;
2177   case MATPRODUCT_RARt:
2178     PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C));
2179     break;
2180   case MATPRODUCT_ABC:
2181     PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C));
2182     break;
2183   default:
2184     break;
2185   }
2186   PetscFunctionReturn(0);
2187 }
2188