xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision fdf6c4e30aafdbc795e4f76379caa977fd5cdf5a)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4           C = A * B
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <petscbt.h>
10 #include <petsc/private/isimpl.h>
11 #include <../src/mat/impls/dense/seq/dense.h>
12 
13 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
14 {
15   PetscFunctionBegin;
16   if (C->ops->matmultnumeric) {
17     PetscCheck(C->ops->matmultnumeric != MatMatMultNumeric_SeqAIJ_SeqAIJ,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Recursive call");
18     PetscCall((*C->ops->matmultnumeric)(A,B,C));
19   } else {
20     PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C));
21   }
22   PetscFunctionReturn(0);
23 }
24 
25 /* Modified from MatCreateSeqAIJWithArrays() */
26 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat)
27 {
28   PetscInt       ii;
29   Mat_SeqAIJ     *aij;
30   PetscBool      isseqaij, osingle, ofree_a, ofree_ij;
31 
32   PetscFunctionBegin;
33   PetscCheck(m <= 0 || !i[0],PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0");
34   PetscCall(MatSetSizes(mat,m,n,m,n));
35 
36   if (!mtype) {
37     PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij));
38     if (!isseqaij) PetscCall(MatSetType(mat,MATSEQAIJ));
39   } else {
40     PetscCall(MatSetType(mat,mtype));
41   }
42 
43   aij  = (Mat_SeqAIJ*)(mat)->data;
44   osingle = aij->singlemalloc;
45   ofree_a = aij->free_a;
46   ofree_ij = aij->free_ij;
47   /* changes the free flags */
48   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,NULL));
49 
50   PetscCall(PetscFree(aij->ilen));
51   PetscCall(PetscFree(aij->imax));
52   PetscCall(PetscMalloc1(m,&aij->imax));
53   PetscCall(PetscMalloc1(m,&aij->ilen));
54   for (ii=0,aij->nonzerorowcnt=0,aij->rmax = 0; ii<m; ii++) {
55     const PetscInt rnz = i[ii+1] - i[ii];
56     aij->nonzerorowcnt += !!rnz;
57     aij->rmax = PetscMax(aij->rmax,rnz);
58     aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii];
59   }
60   aij->maxnz = i[m];
61   aij->nz = i[m];
62 
63   if (osingle) {
64     PetscCall(PetscFree3(aij->a,aij->j,aij->i));
65   } else {
66     if (ofree_a)  PetscCall(PetscFree(aij->a));
67     if (ofree_ij) PetscCall(PetscFree(aij->j));
68     if (ofree_ij) PetscCall(PetscFree(aij->i));
69   }
70   aij->i            = i;
71   aij->j            = j;
72   aij->a            = a;
73   aij->nonew        = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
74   /* default to not retain ownership */
75   aij->singlemalloc = PETSC_FALSE;
76   aij->free_a       = PETSC_FALSE;
77   aij->free_ij      = PETSC_FALSE;
78   PetscCall(MatCheckCompressedRow(mat,aij->nonzerorowcnt,&aij->compressedrow,aij->i,m,0.6));
79   PetscFunctionReturn(0);
80 }
81 
82 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
83 {
84   Mat_Product         *product = C->product;
85   MatProductAlgorithm alg;
86   PetscBool           flg;
87 
88   PetscFunctionBegin;
89   if (product) {
90     alg = product->alg;
91   } else {
92     alg = "sorted";
93   }
94   /* sorted */
95   PetscCall(PetscStrcmp(alg,"sorted",&flg));
96   if (flg) {
97     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C));
98     PetscFunctionReturn(0);
99   }
100 
101   /* scalable */
102   PetscCall(PetscStrcmp(alg,"scalable",&flg));
103   if (flg) {
104     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C));
105     PetscFunctionReturn(0);
106   }
107 
108   /* scalable_fast */
109   PetscCall(PetscStrcmp(alg,"scalable_fast",&flg));
110   if (flg) {
111     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C));
112     PetscFunctionReturn(0);
113   }
114 
115   /* heap */
116   PetscCall(PetscStrcmp(alg,"heap",&flg));
117   if (flg) {
118     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C));
119     PetscFunctionReturn(0);
120   }
121 
122   /* btheap */
123   PetscCall(PetscStrcmp(alg,"btheap",&flg));
124   if (flg) {
125     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C));
126     PetscFunctionReturn(0);
127   }
128 
129   /* llcondensed */
130   PetscCall(PetscStrcmp(alg,"llcondensed",&flg));
131   if (flg) {
132     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C));
133     PetscFunctionReturn(0);
134   }
135 
136   /* rowmerge */
137   PetscCall(PetscStrcmp(alg,"rowmerge",&flg));
138   if (flg) {
139     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C));
140     PetscFunctionReturn(0);
141   }
142 
143 #if defined(PETSC_HAVE_HYPRE)
144   PetscCall(PetscStrcmp(alg,"hypre",&flg));
145   if (flg) {
146     PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C));
147     PetscFunctionReturn(0);
148   }
149 #endif
150 
151   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
152 }
153 
154 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C)
155 {
156   Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
157   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
158   PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
159   PetscReal          afill;
160   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
161   PetscTable         ta;
162   PetscBT            lnkbt;
163   PetscFreeSpaceList free_space=NULL,current_space=NULL;
164 
165   PetscFunctionBegin;
166   /* Get ci and cj */
167   /*---------------*/
168   /* Allocate ci array, arrays for fill computation and */
169   /* free space for accumulating nonzero column info */
170   PetscCall(PetscMalloc1(am+2,&ci));
171   ci[0] = 0;
172 
173   /* create and initialize a linked list */
174   PetscCall(PetscTableCreate(bn,bn,&ta));
175   MatRowMergeMax_SeqAIJ(b,bm,ta);
176   PetscCall(PetscTableGetCount(ta,&Crmax));
177   PetscCall(PetscTableDestroy(&ta));
178 
179   PetscCall(PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt));
180 
181   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
182   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
183 
184   current_space = free_space;
185 
186   /* Determine ci and cj */
187   for (i=0; i<am; i++) {
188     anzi = ai[i+1] - ai[i];
189     aj   = a->j + ai[i];
190     for (j=0; j<anzi; j++) {
191       brow = aj[j];
192       bnzj = bi[brow+1] - bi[brow];
193       bj   = b->j + bi[brow];
194       /* add non-zero cols of B into the sorted linked list lnk */
195       PetscCall(PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt));
196     }
197     /* add possible missing diagonal entry */
198     if (C->force_diagonals) {
199       PetscCall(PetscLLCondensedAddSorted(1,&i,lnk,lnkbt));
200     }
201     cnzi = lnk[0];
202 
203     /* If free space is not available, make more free space */
204     /* Double the amount of total space in the list */
205     if (current_space->local_remaining<cnzi) {
206       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space));
207       ndouble++;
208     }
209 
210     /* Copy data into free space, then initialize lnk */
211     PetscCall(PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt));
212 
213     current_space->array           += cnzi;
214     current_space->local_used      += cnzi;
215     current_space->local_remaining -= cnzi;
216 
217     ci[i+1] = ci[i] + cnzi;
218   }
219 
220   /* Column indices are in the list of free space */
221   /* Allocate space for cj, initialize cj, and */
222   /* destroy list of free space and other temporary array(s) */
223   PetscCall(PetscMalloc1(ci[am]+1,&cj));
224   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
225   PetscCall(PetscLLCondensedDestroy(lnk,lnkbt));
226 
227   /* put together the new symbolic matrix */
228   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
229   PetscCall(MatSetBlockSizesFromMats(C,A,B));
230 
231   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
232   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
233   c          = (Mat_SeqAIJ*)(C->data);
234   c->free_a  = PETSC_FALSE;
235   c->free_ij = PETSC_TRUE;
236   c->nonew   = 0;
237 
238   /* fast, needs non-scalable O(bn) array 'abdense' */
239   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
240 
241   /* set MatInfo */
242   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
243   if (afill < 1.0) afill = 1.0;
244   C->info.mallocs           = ndouble;
245   C->info.fill_ratio_given  = fill;
246   C->info.fill_ratio_needed = afill;
247 
248 #if defined(PETSC_USE_INFO)
249   if (ci[am]) {
250     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
251     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
252   } else {
253     PetscCall(PetscInfo(C,"Empty matrix product\n"));
254   }
255 #endif
256   PetscFunctionReturn(0);
257 }
258 
259 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C)
260 {
261   PetscLogDouble    flops=0.0;
262   Mat_SeqAIJ        *a   = (Mat_SeqAIJ*)A->data;
263   Mat_SeqAIJ        *b   = (Mat_SeqAIJ*)B->data;
264   Mat_SeqAIJ        *c   = (Mat_SeqAIJ*)C->data;
265   PetscInt          *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
266   PetscInt          am   =A->rmap->n,cm=C->rmap->n;
267   PetscInt          i,j,k,anzi,bnzi,cnzi,brow;
268   PetscScalar       *ca,valtmp;
269   PetscScalar       *ab_dense;
270   PetscContainer    cab_dense;
271   const PetscScalar *aa,*ba,*baj;
272 
273   PetscFunctionBegin;
274   PetscCall(MatSeqAIJGetArrayRead(A,&aa));
275   PetscCall(MatSeqAIJGetArrayRead(B,&ba));
276   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
277     PetscCall(PetscMalloc1(ci[cm]+1,&ca));
278     c->a      = ca;
279     c->free_a = PETSC_TRUE;
280   } else ca = c->a;
281 
282   /* TODO this should be done in the symbolic phase */
283   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
284      that is hard to eradicate) */
285   PetscCall(PetscObjectQuery((PetscObject)C,"__PETSc__ab_dense",(PetscObject*)&cab_dense));
286   if (!cab_dense) {
287     PetscCall(PetscMalloc1(B->cmap->N,&ab_dense));
288     PetscCall(PetscContainerCreate(PETSC_COMM_SELF,&cab_dense));
289     PetscCall(PetscContainerSetPointer(cab_dense,ab_dense));
290     PetscCall(PetscContainerSetUserDestroy(cab_dense,PetscContainerUserDestroyDefault));
291     PetscCall(PetscObjectCompose((PetscObject)C,"__PETSc__ab_dense",(PetscObject)cab_dense));
292     PetscCall(PetscObjectDereference((PetscObject)cab_dense));
293   }
294   PetscCall(PetscContainerGetPointer(cab_dense,(void**)&ab_dense));
295   PetscCall(PetscArrayzero(ab_dense,B->cmap->N));
296 
297   /* clean old values in C */
298   PetscCall(PetscArrayzero(ca,ci[cm]));
299   /* Traverse A row-wise. */
300   /* Build the ith row in C by summing over nonzero columns in A, */
301   /* the rows of B corresponding to nonzeros of A. */
302   for (i=0; i<am; i++) {
303     anzi = ai[i+1] - ai[i];
304     for (j=0; j<anzi; j++) {
305       brow = aj[j];
306       bnzi = bi[brow+1] - bi[brow];
307       bjj  = bj + bi[brow];
308       baj  = ba + bi[brow];
309       /* perform dense axpy */
310       valtmp = aa[j];
311       for (k=0; k<bnzi; k++) {
312         ab_dense[bjj[k]] += valtmp*baj[k];
313       }
314       flops += 2*bnzi;
315     }
316     aj += anzi; aa += anzi;
317 
318     cnzi = ci[i+1] - ci[i];
319     for (k=0; k<cnzi; k++) {
320       ca[k]          += ab_dense[cj[k]];
321       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
322     }
323     flops += cnzi;
324     cj    += cnzi; ca += cnzi;
325   }
326 #if defined(PETSC_HAVE_DEVICE)
327   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
328 #endif
329   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
330   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
331   PetscCall(PetscLogFlops(flops));
332   PetscCall(MatSeqAIJRestoreArrayRead(A,&aa));
333   PetscCall(MatSeqAIJRestoreArrayRead(B,&ba));
334   PetscFunctionReturn(0);
335 }
336 
337 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
338 {
339   PetscLogDouble    flops=0.0;
340   Mat_SeqAIJ        *a   = (Mat_SeqAIJ*)A->data;
341   Mat_SeqAIJ        *b   = (Mat_SeqAIJ*)B->data;
342   Mat_SeqAIJ        *c   = (Mat_SeqAIJ*)C->data;
343   PetscInt          *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
344   PetscInt          am   = A->rmap->N,cm=C->rmap->N;
345   PetscInt          i,j,k,anzi,bnzi,cnzi,brow;
346   PetscScalar       *ca=c->a,valtmp;
347   const PetscScalar *aa,*ba,*baj;
348   PetscInt          nextb;
349 
350   PetscFunctionBegin;
351   PetscCall(MatSeqAIJGetArrayRead(A,&aa));
352   PetscCall(MatSeqAIJGetArrayRead(B,&ba));
353   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
354     PetscCall(PetscMalloc1(ci[cm]+1,&ca));
355     c->a      = ca;
356     c->free_a = PETSC_TRUE;
357   }
358 
359   /* clean old values in C */
360   PetscCall(PetscArrayzero(ca,ci[cm]));
361   /* Traverse A row-wise. */
362   /* Build the ith row in C by summing over nonzero columns in A, */
363   /* the rows of B corresponding to nonzeros of A. */
364   for (i=0; i<am; i++) {
365     anzi = ai[i+1] - ai[i];
366     cnzi = ci[i+1] - ci[i];
367     for (j=0; j<anzi; j++) {
368       brow = aj[j];
369       bnzi = bi[brow+1] - bi[brow];
370       bjj  = bj + bi[brow];
371       baj  = ba + bi[brow];
372       /* perform sparse axpy */
373       valtmp = aa[j];
374       nextb  = 0;
375       for (k=0; nextb<bnzi; k++) {
376         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
377           ca[k] += valtmp*baj[nextb++];
378         }
379       }
380       flops += 2*bnzi;
381     }
382     aj += anzi; aa += anzi;
383     cj += cnzi; ca += cnzi;
384   }
385 #if defined(PETSC_HAVE_DEVICE)
386   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
387 #endif
388   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
389   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
390   PetscCall(PetscLogFlops(flops));
391   PetscCall(MatSeqAIJRestoreArrayRead(A,&aa));
392   PetscCall(MatSeqAIJRestoreArrayRead(B,&ba));
393   PetscFunctionReturn(0);
394 }
395 
396 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C)
397 {
398   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
399   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
400   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
401   MatScalar          *ca;
402   PetscReal          afill;
403   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
404   PetscTable         ta;
405   PetscFreeSpaceList free_space=NULL,current_space=NULL;
406 
407   PetscFunctionBegin;
408   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
409   /*-----------------------------------------------------------------------------------------*/
410   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
411   PetscCall(PetscMalloc1(am+2,&ci));
412   ci[0] = 0;
413 
414   /* create and initialize a linked list */
415   PetscCall(PetscTableCreate(bn,bn,&ta));
416   MatRowMergeMax_SeqAIJ(b,bm,ta);
417   PetscCall(PetscTableGetCount(ta,&Crmax));
418   PetscCall(PetscTableDestroy(&ta));
419 
420   PetscCall(PetscLLCondensedCreate_fast(Crmax,&lnk));
421 
422   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
423   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
424   current_space = free_space;
425 
426   /* Determine ci and cj */
427   for (i=0; i<am; i++) {
428     anzi = ai[i+1] - ai[i];
429     aj   = a->j + ai[i];
430     for (j=0; j<anzi; j++) {
431       brow = aj[j];
432       bnzj = bi[brow+1] - bi[brow];
433       bj   = b->j + bi[brow];
434       /* add non-zero cols of B into the sorted linked list lnk */
435       PetscCall(PetscLLCondensedAddSorted_fast(bnzj,bj,lnk));
436     }
437     /* add possible missing diagonal entry */
438     if (C->force_diagonals) {
439       PetscCall(PetscLLCondensedAddSorted_fast(1,&i,lnk));
440     }
441     cnzi = lnk[1];
442 
443     /* If free space is not available, make more free space */
444     /* Double the amount of total space in the list */
445     if (current_space->local_remaining<cnzi) {
446       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space));
447       ndouble++;
448     }
449 
450     /* Copy data into free space, then initialize lnk */
451     PetscCall(PetscLLCondensedClean_fast(cnzi,current_space->array,lnk));
452 
453     current_space->array           += cnzi;
454     current_space->local_used      += cnzi;
455     current_space->local_remaining -= cnzi;
456 
457     ci[i+1] = ci[i] + cnzi;
458   }
459 
460   /* Column indices are in the list of free space */
461   /* Allocate space for cj, initialize cj, and */
462   /* destroy list of free space and other temporary array(s) */
463   PetscCall(PetscMalloc1(ci[am]+1,&cj));
464   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
465   PetscCall(PetscLLCondensedDestroy_fast(lnk));
466 
467   /* Allocate space for ca */
468   PetscCall(PetscCalloc1(ci[am]+1,&ca));
469 
470   /* put together the new symbolic matrix */
471   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C));
472   PetscCall(MatSetBlockSizesFromMats(C,A,B));
473 
474   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
475   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
476   c          = (Mat_SeqAIJ*)(C->data);
477   c->free_a  = PETSC_TRUE;
478   c->free_ij = PETSC_TRUE;
479   c->nonew   = 0;
480 
481   /* slower, less memory */
482   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
483 
484   /* set MatInfo */
485   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
486   if (afill < 1.0) afill = 1.0;
487   C->info.mallocs           = ndouble;
488   C->info.fill_ratio_given  = fill;
489   C->info.fill_ratio_needed = afill;
490 
491 #if defined(PETSC_USE_INFO)
492   if (ci[am]) {
493     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
494     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
495   } else {
496     PetscCall(PetscInfo(C,"Empty matrix product\n"));
497   }
498 #endif
499   PetscFunctionReturn(0);
500 }
501 
502 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C)
503 {
504   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
505   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
506   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
507   MatScalar          *ca;
508   PetscReal          afill;
509   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
510   PetscTable         ta;
511   PetscFreeSpaceList free_space=NULL,current_space=NULL;
512 
513   PetscFunctionBegin;
514   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
515   /*---------------------------------------------------------------------------------------------*/
516   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
517   PetscCall(PetscMalloc1(am+2,&ci));
518   ci[0] = 0;
519 
520   /* create and initialize a linked list */
521   PetscCall(PetscTableCreate(bn,bn,&ta));
522   MatRowMergeMax_SeqAIJ(b,bm,ta);
523   PetscCall(PetscTableGetCount(ta,&Crmax));
524   PetscCall(PetscTableDestroy(&ta));
525   PetscCall(PetscLLCondensedCreate_Scalable(Crmax,&lnk));
526 
527   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
528   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
529   current_space = free_space;
530 
531   /* Determine ci and cj */
532   for (i=0; i<am; i++) {
533     anzi = ai[i+1] - ai[i];
534     aj   = a->j + ai[i];
535     for (j=0; j<anzi; j++) {
536       brow = aj[j];
537       bnzj = bi[brow+1] - bi[brow];
538       bj   = b->j + bi[brow];
539       /* add non-zero cols of B into the sorted linked list lnk */
540       PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk));
541     }
542     /* add possible missing diagonal entry */
543     if (C->force_diagonals) {
544       PetscCall(PetscLLCondensedAddSorted_Scalable(1,&i,lnk));
545     }
546 
547     cnzi = lnk[0];
548 
549     /* If free space is not available, make more free space */
550     /* Double the amount of total space in the list */
551     if (current_space->local_remaining<cnzi) {
552       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space));
553       ndouble++;
554     }
555 
556     /* Copy data into free space, then initialize lnk */
557     PetscCall(PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk));
558 
559     current_space->array           += cnzi;
560     current_space->local_used      += cnzi;
561     current_space->local_remaining -= cnzi;
562 
563     ci[i+1] = ci[i] + cnzi;
564   }
565 
566   /* Column indices are in the list of free space */
567   /* Allocate space for cj, initialize cj, and */
568   /* destroy list of free space and other temporary array(s) */
569   PetscCall(PetscMalloc1(ci[am]+1,&cj));
570   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
571   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
572 
573   /* Allocate space for ca */
574   /*-----------------------*/
575   PetscCall(PetscCalloc1(ci[am]+1,&ca));
576 
577   /* put together the new symbolic matrix */
578   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C));
579   PetscCall(MatSetBlockSizesFromMats(C,A,B));
580 
581   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
582   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
583   c          = (Mat_SeqAIJ*)(C->data);
584   c->free_a  = PETSC_TRUE;
585   c->free_ij = PETSC_TRUE;
586   c->nonew   = 0;
587 
588   /* slower, less memory */
589   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
590 
591   /* set MatInfo */
592   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
593   if (afill < 1.0) afill = 1.0;
594   C->info.mallocs           = ndouble;
595   C->info.fill_ratio_given  = fill;
596   C->info.fill_ratio_needed = afill;
597 
598 #if defined(PETSC_USE_INFO)
599   if (ci[am]) {
600     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
601     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
602   } else {
603     PetscCall(PetscInfo(C,"Empty matrix product\n"));
604   }
605 #endif
606   PetscFunctionReturn(0);
607 }
608 
609 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C)
610 {
611   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
612   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
613   PetscInt           *ci,*cj,*bb;
614   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
615   PetscReal          afill;
616   PetscInt           i,j,col,ndouble = 0;
617   PetscFreeSpaceList free_space=NULL,current_space=NULL;
618   PetscHeap          h;
619 
620   PetscFunctionBegin;
621   /* Get ci and cj - by merging sorted rows using a heap */
622   /*---------------------------------------------------------------------------------------------*/
623   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
624   PetscCall(PetscMalloc1(am+2,&ci));
625   ci[0] = 0;
626 
627   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
628   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
629   current_space = free_space;
630 
631   PetscCall(PetscHeapCreate(a->rmax,&h));
632   PetscCall(PetscMalloc1(a->rmax,&bb));
633 
634   /* Determine ci and cj */
635   for (i=0; i<am; i++) {
636     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
637     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
638     ci[i+1] = ci[i];
639     /* Populate the min heap */
640     for (j=0; j<anzi; j++) {
641       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
642       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
643         PetscCall(PetscHeapAdd(h,j,bj[bb[j]++]));
644       }
645     }
646     /* Pick off the min element, adding it to free space */
647     PetscCall(PetscHeapPop(h,&j,&col));
648     while (j >= 0) {
649       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
650         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space));
651         ndouble++;
652       }
653       *(current_space->array++) = col;
654       current_space->local_used++;
655       current_space->local_remaining--;
656       ci[i+1]++;
657 
658       /* stash if anything else remains in this row of B */
659       if (bb[j] < bi[acol[j]+1]) PetscCall(PetscHeapStash(h,j,bj[bb[j]++]));
660       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
661         PetscInt j2,col2;
662         PetscCall(PetscHeapPeek(h,&j2,&col2));
663         if (col2 != col) break;
664         PetscCall(PetscHeapPop(h,&j2,&col2));
665         if (bb[j2] < bi[acol[j2]+1]) PetscCall(PetscHeapStash(h,j2,bj[bb[j2]++]));
666       }
667       /* Put any stashed elements back into the min heap */
668       PetscCall(PetscHeapUnstash(h));
669       PetscCall(PetscHeapPop(h,&j,&col));
670     }
671   }
672   PetscCall(PetscFree(bb));
673   PetscCall(PetscHeapDestroy(&h));
674 
675   /* Column indices are in the list of free space */
676   /* Allocate space for cj, initialize cj, and */
677   /* destroy list of free space and other temporary array(s) */
678   PetscCall(PetscMalloc1(ci[am],&cj));
679   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
680 
681   /* put together the new symbolic matrix */
682   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
683   PetscCall(MatSetBlockSizesFromMats(C,A,B));
684 
685   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
686   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
687   c          = (Mat_SeqAIJ*)(C->data);
688   c->free_a  = PETSC_TRUE;
689   c->free_ij = PETSC_TRUE;
690   c->nonew   = 0;
691 
692   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
693 
694   /* set MatInfo */
695   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
696   if (afill < 1.0) afill = 1.0;
697   C->info.mallocs           = ndouble;
698   C->info.fill_ratio_given  = fill;
699   C->info.fill_ratio_needed = afill;
700 
701 #if defined(PETSC_USE_INFO)
702   if (ci[am]) {
703     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
704     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
705   } else {
706     PetscCall(PetscInfo(C,"Empty matrix product\n"));
707   }
708 #endif
709   PetscFunctionReturn(0);
710 }
711 
712 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C)
713 {
714   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
715   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
716   PetscInt           *ci,*cj,*bb;
717   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
718   PetscReal          afill;
719   PetscInt           i,j,col,ndouble = 0;
720   PetscFreeSpaceList free_space=NULL,current_space=NULL;
721   PetscHeap          h;
722   PetscBT            bt;
723 
724   PetscFunctionBegin;
725   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
726   /*---------------------------------------------------------------------------------------------*/
727   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
728   PetscCall(PetscMalloc1(am+2,&ci));
729   ci[0] = 0;
730 
731   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
732   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
733 
734   current_space = free_space;
735 
736   PetscCall(PetscHeapCreate(a->rmax,&h));
737   PetscCall(PetscMalloc1(a->rmax,&bb));
738   PetscCall(PetscBTCreate(bn,&bt));
739 
740   /* Determine ci and cj */
741   for (i=0; i<am; i++) {
742     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
743     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
744     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
745     ci[i+1] = ci[i];
746     /* Populate the min heap */
747     for (j=0; j<anzi; j++) {
748       PetscInt brow = acol[j];
749       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
750         PetscInt bcol = bj[bb[j]];
751         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
752           PetscCall(PetscHeapAdd(h,j,bcol));
753           bb[j]++;
754           break;
755         }
756       }
757     }
758     /* Pick off the min element, adding it to free space */
759     PetscCall(PetscHeapPop(h,&j,&col));
760     while (j >= 0) {
761       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
762         fptr = NULL;                      /* need PetscBTMemzero */
763         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space));
764         ndouble++;
765       }
766       *(current_space->array++) = col;
767       current_space->local_used++;
768       current_space->local_remaining--;
769       ci[i+1]++;
770 
771       /* stash if anything else remains in this row of B */
772       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
773         PetscInt bcol = bj[bb[j]];
774         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
775           PetscCall(PetscHeapAdd(h,j,bcol));
776           bb[j]++;
777           break;
778         }
779       }
780       PetscCall(PetscHeapPop(h,&j,&col));
781     }
782     if (fptr) {                 /* Clear the bits for this row */
783       for (; fptr<current_space->array; fptr++) PetscCall(PetscBTClear(bt,*fptr));
784     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
785       PetscCall(PetscBTMemzero(bn,bt));
786     }
787   }
788   PetscCall(PetscFree(bb));
789   PetscCall(PetscHeapDestroy(&h));
790   PetscCall(PetscBTDestroy(&bt));
791 
792   /* Column indices are in the list of free space */
793   /* Allocate space for cj, initialize cj, and */
794   /* destroy list of free space and other temporary array(s) */
795   PetscCall(PetscMalloc1(ci[am],&cj));
796   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
797 
798   /* put together the new symbolic matrix */
799   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
800   PetscCall(MatSetBlockSizesFromMats(C,A,B));
801 
802   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
803   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
804   c          = (Mat_SeqAIJ*)(C->data);
805   c->free_a  = PETSC_TRUE;
806   c->free_ij = PETSC_TRUE;
807   c->nonew   = 0;
808 
809   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
810 
811   /* set MatInfo */
812   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
813   if (afill < 1.0) afill = 1.0;
814   C->info.mallocs           = ndouble;
815   C->info.fill_ratio_given  = fill;
816   C->info.fill_ratio_needed = afill;
817 
818 #if defined(PETSC_USE_INFO)
819   if (ci[am]) {
820     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
821     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
822   } else {
823     PetscCall(PetscInfo(C,"Empty matrix product\n"));
824   }
825 #endif
826   PetscFunctionReturn(0);
827 }
828 
829 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C)
830 {
831   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
832   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1;
833   PetscInt           *ci,*cj,*outputj,worki_L1[9],worki_L2[9];
834   PetscInt           c_maxmem,a_maxrownnz=0,a_rownnz;
835   const PetscInt     workcol[8]={0,1,2,3,4,5,6,7};
836   const PetscInt     am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
837   const PetscInt     *brow_ptr[8],*brow_end[8];
838   PetscInt           window[8];
839   PetscInt           window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows;
840   PetscInt           i,k,ndouble=0,L1_rowsleft,rowsleft;
841   PetscReal          afill;
842   PetscInt           *workj_L1,*workj_L2,*workj_L3;
843   PetscInt           L1_nnz,L2_nnz;
844 
845   /* Step 1: Get upper bound on memory required for allocation.
846              Because of the way virtual memory works,
847              only the memory pages that are actually needed will be physically allocated. */
848   PetscFunctionBegin;
849   PetscCall(PetscMalloc1(am+1,&ci));
850   for (i=0; i<am; i++) {
851     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
852     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
853     a_rownnz = 0;
854     for (k=0; k<anzi; ++k) {
855       a_rownnz += bi[acol[k]+1] - bi[acol[k]];
856       if (a_rownnz > bn) {
857         a_rownnz = bn;
858         break;
859       }
860     }
861     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
862   }
863   /* temporary work areas for merging rows */
864   PetscCall(PetscMalloc1(a_maxrownnz*8,&workj_L1));
865   PetscCall(PetscMalloc1(a_maxrownnz*8,&workj_L2));
866   PetscCall(PetscMalloc1(a_maxrownnz,&workj_L3));
867 
868   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
869   c_maxmem = 8*(ai[am]+bi[bm]);
870   /* Step 2: Populate pattern for C */
871   PetscCall(PetscMalloc1(c_maxmem,&cj));
872 
873   ci_nnz       = 0;
874   ci[0]        = 0;
875   worki_L1[0]  = 0;
876   worki_L2[0]  = 0;
877   for (i=0; i<am; i++) {
878     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
879     const PetscInt *acol = aj + ai[i];      /* column indices of nonzero entries in this row */
880     rowsleft             = anzi;
881     inputcol_L1          = acol;
882     L2_nnz               = 0;
883     L2_nrows             = 1;  /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
884     worki_L2[1]          = 0;
885     outputi_nnz          = 0;
886 
887     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
888     while (ci_nnz+a_maxrownnz > c_maxmem) {
889       c_maxmem *= 2;
890       ndouble++;
891       PetscCall(PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj));
892     }
893 
894     while (rowsleft) {
895       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
896       L1_nrows    = 0;
897       L1_nnz      = 0;
898       inputcol    = inputcol_L1;
899       inputi      = bi;
900       inputj      = bj;
901 
902       /* The following macro is used to specialize for small rows in A.
903          This helps with compiler unrolling, improving performance substantially.
904           Input:  inputj   inputi  inputcol  bn
905           Output: outputj  outputi_nnz                       */
906        #define MatMatMultSymbolic_RowMergeMacro(ANNZ)                        \
907          window_min  = bn;                                                   \
908          outputi_nnz = 0;                                                    \
909          for (k=0; k<ANNZ; ++k) {                                            \
910            brow_ptr[k] = inputj + inputi[inputcol[k]];                       \
911            brow_end[k] = inputj + inputi[inputcol[k]+1];                     \
912            window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn;   \
913            window_min  = PetscMin(window[k], window_min);                    \
914          }                                                                   \
915          while (window_min < bn) {                                           \
916            outputj[outputi_nnz++] = window_min;                              \
917            /* advance front and compute new minimum */                       \
918            old_window_min = window_min;                                      \
919            window_min = bn;                                                  \
920            for (k=0; k<ANNZ; ++k) {                                          \
921              if (window[k] == old_window_min) {                              \
922                brow_ptr[k]++;                                                \
923                window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
924              }                                                               \
925              window_min = PetscMin(window[k], window_min);                   \
926            }                                                                 \
927          }
928 
929       /************** L E V E L  1 ***************/
930       /* Merge up to 8 rows of B to L1 work array*/
931       while (L1_rowsleft) {
932         outputi_nnz = 0;
933         if (anzi > 8)  outputj = workj_L1 + L1_nnz;     /* Level 1 rowmerge*/
934         else           outputj = cj + ci_nnz;           /* Merge directly to C */
935 
936         switch (L1_rowsleft) {
937         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
938                  brow_end[0] = inputj + inputi[inputcol[0]+1];
939                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
940                  inputcol    += L1_rowsleft;
941                  rowsleft    -= L1_rowsleft;
942                  L1_rowsleft  = 0;
943                  break;
944         case 2:  MatMatMultSymbolic_RowMergeMacro(2);
945                  inputcol    += L1_rowsleft;
946                  rowsleft    -= L1_rowsleft;
947                  L1_rowsleft  = 0;
948                  break;
949         case 3: MatMatMultSymbolic_RowMergeMacro(3);
950                  inputcol    += L1_rowsleft;
951                  rowsleft    -= L1_rowsleft;
952                  L1_rowsleft  = 0;
953                  break;
954         case 4:  MatMatMultSymbolic_RowMergeMacro(4);
955                  inputcol    += L1_rowsleft;
956                  rowsleft    -= L1_rowsleft;
957                  L1_rowsleft  = 0;
958                  break;
959         case 5:  MatMatMultSymbolic_RowMergeMacro(5);
960                  inputcol    += L1_rowsleft;
961                  rowsleft    -= L1_rowsleft;
962                  L1_rowsleft  = 0;
963                  break;
964         case 6:  MatMatMultSymbolic_RowMergeMacro(6);
965                  inputcol    += L1_rowsleft;
966                  rowsleft    -= L1_rowsleft;
967                  L1_rowsleft  = 0;
968                  break;
969         case 7:  MatMatMultSymbolic_RowMergeMacro(7);
970                  inputcol    += L1_rowsleft;
971                  rowsleft    -= L1_rowsleft;
972                  L1_rowsleft  = 0;
973                  break;
974         default: MatMatMultSymbolic_RowMergeMacro(8);
975                  inputcol    += 8;
976                  rowsleft    -= 8;
977                  L1_rowsleft -= 8;
978                  break;
979         }
980         inputcol_L1           = inputcol;
981         L1_nnz               += outputi_nnz;
982         worki_L1[++L1_nrows]  = L1_nnz;
983       }
984 
985       /********************** L E V E L  2 ************************/
986       /* Merge from L1 work array to either C or to L2 work array */
987       if (anzi > 8) {
988         inputi      = worki_L1;
989         inputj      = workj_L1;
990         inputcol    = workcol;
991         outputi_nnz = 0;
992 
993         if (anzi <= 64) outputj = cj + ci_nnz;        /* Merge from L1 work array to C */
994         else            outputj = workj_L2 + L2_nnz;  /* Merge from L1 work array to L2 work array */
995 
996         switch (L1_nrows) {
997         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
998                  brow_end[0] = inputj + inputi[inputcol[0]+1];
999                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1000                  break;
1001         case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
1002         case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
1003         case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1004         case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1005         case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1006         case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1007         case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1008         default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
1009         }
1010         L2_nnz               += outputi_nnz;
1011         worki_L2[++L2_nrows]  = L2_nnz;
1012 
1013         /************************ L E V E L  3 **********************/
1014         /* Merge from L2 work array to either C or to L2 work array */
1015         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1016           inputi      = worki_L2;
1017           inputj      = workj_L2;
1018           inputcol    = workcol;
1019           outputi_nnz = 0;
1020           if (rowsleft) outputj = workj_L3;
1021           else          outputj = cj + ci_nnz;
1022           switch (L2_nrows) {
1023           case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
1024                    brow_end[0] = inputj + inputi[inputcol[0]+1];
1025                    for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1026                    break;
1027           case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
1028           case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
1029           case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1030           case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1031           case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1032           case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1033           case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1034           default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1035           }
1036           L2_nrows    = 1;
1037           L2_nnz      = outputi_nnz;
1038           worki_L2[1] = outputi_nnz;
1039           /* Copy to workj_L2 */
1040           if (rowsleft) {
1041             for (k=0; k<outputi_nnz; ++k)  workj_L2[k] = outputj[k];
1042           }
1043         }
1044       }
1045     }  /* while (rowsleft) */
1046 #undef MatMatMultSymbolic_RowMergeMacro
1047 
1048     /* terminate current row */
1049     ci_nnz += outputi_nnz;
1050     ci[i+1] = ci_nnz;
1051   }
1052 
1053   /* Step 3: Create the new symbolic matrix */
1054   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
1055   PetscCall(MatSetBlockSizesFromMats(C,A,B));
1056 
1057   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1058   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1059   c          = (Mat_SeqAIJ*)(C->data);
1060   c->free_a  = PETSC_TRUE;
1061   c->free_ij = PETSC_TRUE;
1062   c->nonew   = 0;
1063 
1064   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1065 
1066   /* set MatInfo */
1067   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1068   if (afill < 1.0) afill = 1.0;
1069   C->info.mallocs           = ndouble;
1070   C->info.fill_ratio_given  = fill;
1071   C->info.fill_ratio_needed = afill;
1072 
1073 #if defined(PETSC_USE_INFO)
1074   if (ci[am]) {
1075     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
1076     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
1077   } else {
1078     PetscCall(PetscInfo(C,"Empty matrix product\n"));
1079   }
1080 #endif
1081 
1082   /* Step 4: Free temporary work areas */
1083   PetscCall(PetscFree(workj_L1));
1084   PetscCall(PetscFree(workj_L2));
1085   PetscCall(PetscFree(workj_L3));
1086   PetscFunctionReturn(0);
1087 }
1088 
1089 /* concatenate unique entries and then sort */
1090 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C)
1091 {
1092   Mat_SeqAIJ     *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1093   const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
1094   PetscInt       *ci,*cj,bcol;
1095   PetscInt       am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1096   PetscReal      afill;
1097   PetscInt       i,j,ndouble = 0;
1098   PetscSegBuffer seg,segrow;
1099   char           *seen;
1100 
1101   PetscFunctionBegin;
1102   PetscCall(PetscMalloc1(am+1,&ci));
1103   ci[0] = 0;
1104 
1105   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1106   PetscCall(PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg));
1107   PetscCall(PetscSegBufferCreate(sizeof(PetscInt),100,&segrow));
1108   PetscCall(PetscCalloc1(bn,&seen));
1109 
1110   /* Determine ci and cj */
1111   for (i=0; i<am; i++) {
1112     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1113     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1114     PetscInt packlen = 0,*PETSC_RESTRICT crow;
1115 
1116     /* Pack segrow */
1117     for (j=0; j<anzi; j++) {
1118       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
1119       for (k=bjstart; k<bjend; k++) {
1120         bcol = bj[k];
1121         if (!seen[bcol]) { /* new entry */
1122           PetscInt *PETSC_RESTRICT slot;
1123           PetscCall(PetscSegBufferGetInts(segrow,1,&slot));
1124           *slot = bcol;
1125           seen[bcol] = 1;
1126           packlen++;
1127         }
1128       }
1129     }
1130 
1131     /* Check i-th diagonal entry */
1132     if (C->force_diagonals && !seen[i]) {
1133       PetscInt *PETSC_RESTRICT slot;
1134       PetscCall(PetscSegBufferGetInts(segrow,1,&slot));
1135       *slot   = i;
1136       seen[i] = 1;
1137       packlen++;
1138     }
1139 
1140     PetscCall(PetscSegBufferGetInts(seg,packlen,&crow));
1141     PetscCall(PetscSegBufferExtractTo(segrow,crow));
1142     PetscCall(PetscSortInt(packlen,crow));
1143     ci[i+1] = ci[i] + packlen;
1144     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
1145   }
1146   PetscCall(PetscSegBufferDestroy(&segrow));
1147   PetscCall(PetscFree(seen));
1148 
1149   /* Column indices are in the segmented buffer */
1150   PetscCall(PetscSegBufferExtractAlloc(seg,&cj));
1151   PetscCall(PetscSegBufferDestroy(&seg));
1152 
1153   /* put together the new symbolic matrix */
1154   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
1155   PetscCall(MatSetBlockSizesFromMats(C,A,B));
1156 
1157   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1158   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1159   c          = (Mat_SeqAIJ*)(C->data);
1160   c->free_a  = PETSC_TRUE;
1161   c->free_ij = PETSC_TRUE;
1162   c->nonew   = 0;
1163 
1164   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1165 
1166   /* set MatInfo */
1167   afill = (PetscReal)ci[am]/PetscMax(ai[am]+bi[bm],1) + 1.e-5;
1168   if (afill < 1.0) afill = 1.0;
1169   C->info.mallocs           = ndouble;
1170   C->info.fill_ratio_given  = fill;
1171   C->info.fill_ratio_needed = afill;
1172 
1173 #if defined(PETSC_USE_INFO)
1174   if (ci[am]) {
1175     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
1176     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
1177   } else {
1178     PetscCall(PetscInfo(C,"Empty matrix product\n"));
1179   }
1180 #endif
1181   PetscFunctionReturn(0);
1182 }
1183 
1184 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
1185 {
1186   Mat_MatMatTransMult *abt=(Mat_MatMatTransMult *)data;
1187 
1188   PetscFunctionBegin;
1189   PetscCall(MatTransposeColoringDestroy(&abt->matcoloring));
1190   PetscCall(MatDestroy(&abt->Bt_den));
1191   PetscCall(MatDestroy(&abt->ABt_den));
1192   PetscCall(PetscFree(abt));
1193   PetscFunctionReturn(0);
1194 }
1195 
1196 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1197 {
1198   Mat                 Bt;
1199   PetscInt            *bti,*btj;
1200   Mat_MatMatTransMult *abt;
1201   Mat_Product         *product = C->product;
1202   char                *alg;
1203 
1204   PetscFunctionBegin;
1205   PetscCheck(product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1206   PetscCheck(!product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
1207 
1208   /* create symbolic Bt */
1209   PetscCall(MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj));
1210   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt));
1211   PetscCall(MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs)));
1212   PetscCall(MatSetType(Bt,((PetscObject)A)->type_name));
1213 
1214   /* get symbolic C=A*Bt */
1215   PetscCall(PetscStrallocpy(product->alg,&alg));
1216   PetscCall(MatProductSetAlgorithm(C,"sorted")); /* set algorithm for C = A*Bt */
1217   PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C));
1218   PetscCall(MatProductSetAlgorithm(C,alg)); /* resume original algorithm for ABt product */
1219   PetscCall(PetscFree(alg));
1220 
1221   /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
1222   PetscCall(PetscNew(&abt));
1223 
1224   product->data    = abt;
1225   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
1226 
1227   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
1228 
1229   abt->usecoloring = PETSC_FALSE;
1230   PetscCall(PetscStrcmp(product->alg,"color",&abt->usecoloring));
1231   if (abt->usecoloring) {
1232     /* Create MatTransposeColoring from symbolic C=A*B^T */
1233     MatTransposeColoring matcoloring;
1234     MatColoring          coloring;
1235     ISColoring           iscoloring;
1236     Mat                  Bt_dense,C_dense;
1237 
1238     /* inode causes memory problem */
1239     PetscCall(MatSetOption(C,MAT_USE_INODES,PETSC_FALSE));
1240 
1241     PetscCall(MatColoringCreate(C,&coloring));
1242     PetscCall(MatColoringSetDistance(coloring,2));
1243     PetscCall(MatColoringSetType(coloring,MATCOLORINGSL));
1244     PetscCall(MatColoringSetFromOptions(coloring));
1245     PetscCall(MatColoringApply(coloring,&iscoloring));
1246     PetscCall(MatColoringDestroy(&coloring));
1247     PetscCall(MatTransposeColoringCreate(C,iscoloring,&matcoloring));
1248 
1249     abt->matcoloring = matcoloring;
1250 
1251     PetscCall(ISColoringDestroy(&iscoloring));
1252 
1253     /* Create Bt_dense and C_dense = A*Bt_dense */
1254     PetscCall(MatCreate(PETSC_COMM_SELF,&Bt_dense));
1255     PetscCall(MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors));
1256     PetscCall(MatSetType(Bt_dense,MATSEQDENSE));
1257     PetscCall(MatSeqDenseSetPreallocation(Bt_dense,NULL));
1258 
1259     Bt_dense->assembled = PETSC_TRUE;
1260     abt->Bt_den         = Bt_dense;
1261 
1262     PetscCall(MatCreate(PETSC_COMM_SELF,&C_dense));
1263     PetscCall(MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors));
1264     PetscCall(MatSetType(C_dense,MATSEQDENSE));
1265     PetscCall(MatSeqDenseSetPreallocation(C_dense,NULL));
1266 
1267     Bt_dense->assembled = PETSC_TRUE;
1268     abt->ABt_den  = C_dense;
1269 
1270 #if defined(PETSC_USE_INFO)
1271     {
1272       Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
1273       PetscCall(PetscInfo(C,"Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(double)(((PetscReal)(c->nz))/((PetscReal)(A->rmap->n*matcoloring->ncolors)))));
1274     }
1275 #endif
1276   }
1277   /* clean up */
1278   PetscCall(MatDestroy(&Bt));
1279   PetscCall(MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj));
1280   PetscFunctionReturn(0);
1281 }
1282 
1283 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1284 {
1285   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1286   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
1287   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
1288   PetscLogDouble      flops=0.0;
1289   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
1290   Mat_MatMatTransMult *abt;
1291   Mat_Product         *product = C->product;
1292 
1293   PetscFunctionBegin;
1294   PetscCheck(product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1295   abt = (Mat_MatMatTransMult *)product->data;
1296   PetscCheck(abt,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1297   /* clear old values in C */
1298   if (!c->a) {
1299     PetscCall(PetscCalloc1(ci[cm]+1,&ca));
1300     c->a      = ca;
1301     c->free_a = PETSC_TRUE;
1302   } else {
1303     ca =  c->a;
1304     PetscCall(PetscArrayzero(ca,ci[cm]+1));
1305   }
1306 
1307   if (abt->usecoloring) {
1308     MatTransposeColoring matcoloring = abt->matcoloring;
1309     Mat                  Bt_dense,C_dense = abt->ABt_den;
1310 
1311     /* Get Bt_dense by Apply MatTransposeColoring to B */
1312     Bt_dense = abt->Bt_den;
1313     PetscCall(MatTransColoringApplySpToDen(matcoloring,B,Bt_dense));
1314 
1315     /* C_dense = A*Bt_dense */
1316     PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense));
1317 
1318     /* Recover C from C_dense */
1319     PetscCall(MatTransColoringApplyDenToSp(matcoloring,C_dense,C));
1320     PetscFunctionReturn(0);
1321   }
1322 
1323   for (i=0; i<cm; i++) {
1324     anzi = ai[i+1] - ai[i];
1325     acol = aj + ai[i];
1326     aval = aa + ai[i];
1327     cnzi = ci[i+1] - ci[i];
1328     ccol = cj + ci[i];
1329     cval = ca + ci[i];
1330     for (j=0; j<cnzi; j++) {
1331       brow = ccol[j];
1332       bnzj = bi[brow+1] - bi[brow];
1333       bcol = bj + bi[brow];
1334       bval = ba + bi[brow];
1335 
1336       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
1337       nexta = 0; nextb = 0;
1338       while (nexta<anzi && nextb<bnzj) {
1339         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
1340         if (nexta == anzi) break;
1341         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
1342         if (nextb == bnzj) break;
1343         if (acol[nexta] == bcol[nextb]) {
1344           cval[j] += aval[nexta]*bval[nextb];
1345           nexta++; nextb++;
1346           flops += 2;
1347         }
1348       }
1349     }
1350   }
1351   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
1352   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
1353   PetscCall(PetscLogFlops(flops));
1354   PetscFunctionReturn(0);
1355 }
1356 
1357 PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
1358 {
1359   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult*)data;
1360 
1361   PetscFunctionBegin;
1362   PetscCall(MatDestroy(&atb->At));
1363   if (atb->destroy) PetscCall((*atb->destroy)(atb->data));
1364   PetscCall(PetscFree(atb));
1365   PetscFunctionReturn(0);
1366 }
1367 
1368 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1369 {
1370   Mat            At = NULL;
1371   PetscInt       *ati,*atj;
1372   Mat_Product    *product = C->product;
1373   PetscBool      flg,def,square;
1374 
1375   PetscFunctionBegin;
1376   MatCheckProduct(C,4);
1377   square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE);
1378   /* outerproduct */
1379   PetscCall(PetscStrcmp(product->alg,"outerproduct",&flg));
1380   if (flg) {
1381     /* create symbolic At */
1382     if (!square) {
1383       PetscCall(MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj));
1384       PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At));
1385       PetscCall(MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs)));
1386       PetscCall(MatSetType(At,((PetscObject)A)->type_name));
1387     }
1388     /* get symbolic C=At*B */
1389     PetscCall(MatProductSetAlgorithm(C,"sorted"));
1390     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C));
1391 
1392     /* clean up */
1393     if (!square) {
1394       PetscCall(MatDestroy(&At));
1395       PetscCall(MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj));
1396     }
1397 
1398     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
1399     PetscCall(MatProductSetAlgorithm(C,"outerproduct"));
1400     PetscFunctionReturn(0);
1401   }
1402 
1403   /* matmatmult */
1404   PetscCall(PetscStrcmp(product->alg,"default",&def));
1405   PetscCall(PetscStrcmp(product->alg,"at*b",&flg));
1406   if (flg || def) {
1407     Mat_MatTransMatMult *atb;
1408 
1409     PetscCheck(!product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
1410     PetscCall(PetscNew(&atb));
1411     if (!square) {
1412       PetscCall(MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At));
1413     }
1414     PetscCall(MatProductSetAlgorithm(C,"sorted"));
1415     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C));
1416     PetscCall(MatProductSetAlgorithm(C,"at*b"));
1417     product->data    = atb;
1418     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
1419     atb->At          = At;
1420     atb->updateAt    = PETSC_FALSE; /* because At is computed here */
1421 
1422     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
1423     PetscFunctionReturn(0);
1424   }
1425 
1426   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1427 }
1428 
1429 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1430 {
1431   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1432   PetscInt       am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1433   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1434   PetscLogDouble flops=0.0;
1435   MatScalar      *aa=a->a,*ba,*ca,*caj;
1436 
1437   PetscFunctionBegin;
1438   if (!c->a) {
1439     PetscCall(PetscCalloc1(ci[cm]+1,&ca));
1440 
1441     c->a      = ca;
1442     c->free_a = PETSC_TRUE;
1443   } else {
1444     ca   = c->a;
1445     PetscCall(PetscArrayzero(ca,ci[cm]));
1446   }
1447 
1448   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1449   for (i=0; i<am; i++) {
1450     bj   = b->j + bi[i];
1451     ba   = b->a + bi[i];
1452     bnzi = bi[i+1] - bi[i];
1453     anzi = ai[i+1] - ai[i];
1454     for (j=0; j<anzi; j++) {
1455       nextb = 0;
1456       crow  = *aj++;
1457       cjj   = cj + ci[crow];
1458       caj   = ca + ci[crow];
1459       /* perform sparse axpy operation.  Note cjj includes bj. */
1460       for (k=0; nextb<bnzi; k++) {
1461         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
1462           caj[k] += (*aa)*(*(ba+nextb));
1463           nextb++;
1464         }
1465       }
1466       flops += 2*bnzi;
1467       aa++;
1468     }
1469   }
1470 
1471   /* Assemble the final matrix and clean up */
1472   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
1473   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
1474   PetscCall(PetscLogFlops(flops));
1475   PetscFunctionReturn(0);
1476 }
1477 
1478 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
1479 {
1480   PetscFunctionBegin;
1481   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C));
1482   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1483   PetscFunctionReturn(0);
1484 }
1485 
1486 PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C,const PetscBool add)
1487 {
1488   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
1489   PetscScalar       *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4;
1490   const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av;
1491   const PetscInt    *aj;
1492   PetscInt          cm=C->rmap->n,cn=B->cmap->n,bm,am=A->rmap->n;
1493   PetscInt          clda;
1494   PetscInt          am4,bm4,col,i,j,n;
1495 
1496   PetscFunctionBegin;
1497   if (!cm || !cn) PetscFunctionReturn(0);
1498   PetscCall(MatSeqAIJGetArrayRead(A,&av));
1499   if (add) {
1500     PetscCall(MatDenseGetArray(C,&c));
1501   } else {
1502     PetscCall(MatDenseGetArrayWrite(C,&c));
1503   }
1504   PetscCall(MatDenseGetArrayRead(B,&b));
1505   PetscCall(MatDenseGetLDA(B,&bm));
1506   PetscCall(MatDenseGetLDA(C,&clda));
1507   am4 = 4*clda;
1508   bm4 = 4*bm;
1509   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1510   c1 = c; c2 = c1 + clda; c3 = c2 + clda; c4 = c3 + clda;
1511   for (col=0; col<(cn/4)*4; col += 4) {  /* over columns of C */
1512     for (i=0; i<am; i++) {        /* over rows of A in those columns */
1513       r1 = r2 = r3 = r4 = 0.0;
1514       n  = a->i[i+1] - a->i[i];
1515       aj = a->j + a->i[i];
1516       aa = av + a->i[i];
1517       for (j=0; j<n; j++) {
1518         const PetscScalar aatmp = aa[j];
1519         const PetscInt    ajtmp = aj[j];
1520         r1 += aatmp*b1[ajtmp];
1521         r2 += aatmp*b2[ajtmp];
1522         r3 += aatmp*b3[ajtmp];
1523         r4 += aatmp*b4[ajtmp];
1524       }
1525       if (add) {
1526         c1[i] += r1;
1527         c2[i] += r2;
1528         c3[i] += r3;
1529         c4[i] += r4;
1530       } else {
1531         c1[i] = r1;
1532         c2[i] = r2;
1533         c3[i] = r3;
1534         c4[i] = r4;
1535       }
1536     }
1537     b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1538     c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1539   }
1540   /* process remaining columns */
1541   if (col != cn) {
1542     PetscInt rc = cn-col;
1543 
1544     if (rc == 1) {
1545       for (i=0; i<am; i++) {
1546         r1 = 0.0;
1547         n  = a->i[i+1] - a->i[i];
1548         aj = a->j + a->i[i];
1549         aa = av + a->i[i];
1550         for (j=0; j<n; j++) r1 += aa[j]*b1[aj[j]];
1551         if (add) c1[i] += r1;
1552         else c1[i] = r1;
1553       }
1554     } else if (rc == 2) {
1555       for (i=0; i<am; i++) {
1556         r1 = r2 = 0.0;
1557         n  = a->i[i+1] - a->i[i];
1558         aj = a->j + a->i[i];
1559         aa = av + a->i[i];
1560         for (j=0; j<n; j++) {
1561           const PetscScalar aatmp = aa[j];
1562           const PetscInt    ajtmp = aj[j];
1563           r1 += aatmp*b1[ajtmp];
1564           r2 += aatmp*b2[ajtmp];
1565         }
1566         if (add) {
1567           c1[i] += r1;
1568           c2[i] += r2;
1569         } else {
1570           c1[i] = r1;
1571           c2[i] = r2;
1572         }
1573       }
1574     } else {
1575       for (i=0; i<am; i++) {
1576         r1 = r2 = r3 = 0.0;
1577         n  = a->i[i+1] - a->i[i];
1578         aj = a->j + a->i[i];
1579         aa = av + a->i[i];
1580         for (j=0; j<n; j++) {
1581           const PetscScalar aatmp = aa[j];
1582           const PetscInt    ajtmp = aj[j];
1583           r1 += aatmp*b1[ajtmp];
1584           r2 += aatmp*b2[ajtmp];
1585           r3 += aatmp*b3[ajtmp];
1586         }
1587         if (add) {
1588           c1[i] += r1;
1589           c2[i] += r2;
1590           c3[i] += r3;
1591         } else {
1592           c1[i] = r1;
1593           c2[i] = r2;
1594           c3[i] = r3;
1595         }
1596       }
1597     }
1598   }
1599   PetscCall(PetscLogFlops(cn*(2.0*a->nz)));
1600   if (add) {
1601     PetscCall(MatDenseRestoreArray(C,&c));
1602   } else {
1603     PetscCall(MatDenseRestoreArrayWrite(C,&c));
1604   }
1605   PetscCall(MatDenseRestoreArrayRead(B,&b));
1606   PetscCall(MatSeqAIJRestoreArrayRead(A,&av));
1607   PetscFunctionReturn(0);
1608 }
1609 
1610 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1611 {
1612   PetscFunctionBegin;
1613   PetscCheck(B->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n);
1614   PetscCheck(A->rmap->n == C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n);
1615   PetscCheck(B->cmap->n == C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n);
1616 
1617   PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C,PETSC_FALSE));
1618   PetscFunctionReturn(0);
1619 }
1620 
1621 /* ------------------------------------------------------- */
1622 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1623 {
1624   PetscFunctionBegin;
1625   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
1626   C->ops->productsymbolic = MatProductSymbolic_AB;
1627   PetscFunctionReturn(0);
1628 }
1629 
1630 PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat,Mat,PetscReal,Mat);
1631 
1632 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1633 {
1634   PetscFunctionBegin;
1635   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1636   C->ops->productsymbolic          = MatProductSymbolic_AtB;
1637   PetscFunctionReturn(0);
1638 }
1639 
1640 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
1641 {
1642   PetscFunctionBegin;
1643   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1644   C->ops->productsymbolic          = MatProductSymbolic_ABt;
1645   PetscFunctionReturn(0);
1646 }
1647 
1648 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1649 {
1650   Mat_Product    *product = C->product;
1651 
1652   PetscFunctionBegin;
1653   switch (product->type) {
1654   case MATPRODUCT_AB:
1655     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C));
1656     break;
1657   case MATPRODUCT_AtB:
1658     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C));
1659     break;
1660   case MATPRODUCT_ABt:
1661     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C));
1662     break;
1663   default:
1664     break;
1665   }
1666   PetscFunctionReturn(0);
1667 }
1668 /* ------------------------------------------------------- */
1669 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1670 {
1671   Mat_Product    *product = C->product;
1672   Mat            A = product->A;
1673   PetscBool      baij;
1674 
1675   PetscFunctionBegin;
1676   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij));
1677   if (!baij) { /* A is seqsbaij */
1678     PetscBool sbaij;
1679     PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij));
1680     PetscCheck(sbaij,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format");
1681 
1682     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
1683   } else { /* A is seqbaij */
1684     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
1685   }
1686 
1687   C->ops->productsymbolic = MatProductSymbolic_AB;
1688   PetscFunctionReturn(0);
1689 }
1690 
1691 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1692 {
1693   Mat_Product    *product = C->product;
1694 
1695   PetscFunctionBegin;
1696   MatCheckProduct(C,1);
1697   PetscCheck(product->A,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
1698   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C));
1699   PetscFunctionReturn(0);
1700 }
1701 
1702 /* ------------------------------------------------------- */
1703 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1704 {
1705   PetscFunctionBegin;
1706   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
1707   C->ops->productsymbolic = MatProductSymbolic_AB;
1708   PetscFunctionReturn(0);
1709 }
1710 
1711 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1712 {
1713   Mat_Product    *product = C->product;
1714 
1715   PetscFunctionBegin;
1716   if (product->type == MATPRODUCT_AB) {
1717     PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C));
1718   }
1719   PetscFunctionReturn(0);
1720 }
1721 /* ------------------------------------------------------- */
1722 
1723 PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1724 {
1725   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
1726   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
1727   PetscInt       *bi      = b->i,*bj=b->j;
1728   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1729   MatScalar      *btval,*btval_den,*ba=b->a;
1730   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1731 
1732   PetscFunctionBegin;
1733   btval_den=btdense->v;
1734   PetscCall(PetscArrayzero(btval_den,m*n));
1735   for (k=0; k<ncolors; k++) {
1736     ncolumns = coloring->ncolumns[k];
1737     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1738       col   = *(columns + colorforcol[k] + l);
1739       btcol = bj + bi[col];
1740       btval = ba + bi[col];
1741       anz   = bi[col+1] - bi[col];
1742       for (j=0; j<anz; j++) {
1743         brow            = btcol[j];
1744         btval_den[brow] = btval[j];
1745       }
1746     }
1747     btval_den += m;
1748   }
1749   PetscFunctionReturn(0);
1750 }
1751 
1752 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1753 {
1754   Mat_SeqAIJ        *csp = (Mat_SeqAIJ*)Csp->data;
1755   const PetscScalar *ca_den,*ca_den_ptr;
1756   PetscScalar       *ca=csp->a;
1757   PetscInt          k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1758   PetscInt          brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1759   PetscInt          nrows,*row,*idx;
1760   PetscInt          *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
1761 
1762   PetscFunctionBegin;
1763   PetscCall(MatDenseGetArrayRead(Cden,&ca_den));
1764 
1765   if (brows > 0) {
1766     PetscInt *lstart,row_end,row_start;
1767     lstart = matcoloring->lstart;
1768     PetscCall(PetscArrayzero(lstart,ncolors));
1769 
1770     row_end = brows;
1771     if (row_end > m) row_end = m;
1772     for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1773       ca_den_ptr = ca_den;
1774       for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1775         nrows = matcoloring->nrows[k];
1776         row   = rows  + colorforrow[k];
1777         idx   = den2sp + colorforrow[k];
1778         for (l=lstart[k]; l<nrows; l++) {
1779           if (row[l] >= row_end) {
1780             lstart[k] = l;
1781             break;
1782           } else {
1783             ca[idx[l]] = ca_den_ptr[row[l]];
1784           }
1785         }
1786         ca_den_ptr += m;
1787       }
1788       row_end += brows;
1789       if (row_end > m) row_end = m;
1790     }
1791   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1792     ca_den_ptr = ca_den;
1793     for (k=0; k<ncolors; k++) {
1794       nrows = matcoloring->nrows[k];
1795       row   = rows  + colorforrow[k];
1796       idx   = den2sp + colorforrow[k];
1797       for (l=0; l<nrows; l++) {
1798         ca[idx[l]] = ca_den_ptr[row[l]];
1799       }
1800       ca_den_ptr += m;
1801     }
1802   }
1803 
1804   PetscCall(MatDenseRestoreArrayRead(Cden,&ca_den));
1805 #if defined(PETSC_USE_INFO)
1806   if (matcoloring->brows > 0) {
1807     PetscCall(PetscInfo(Csp,"Loop over %" PetscInt_FMT " row blocks for den2sp\n",brows));
1808   } else {
1809     PetscCall(PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n"));
1810   }
1811 #endif
1812   PetscFunctionReturn(0);
1813 }
1814 
1815 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1816 {
1817   PetscInt       i,n,nrows,Nbs,j,k,m,ncols,col,cm;
1818   const PetscInt *is,*ci,*cj,*row_idx;
1819   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1820   IS             *isa;
1821   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1822   PetscInt       *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1823   PetscInt       *colorforcol,*columns,*columns_i,brows;
1824   PetscBool      flg;
1825 
1826   PetscFunctionBegin;
1827   PetscCall(ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa));
1828 
1829   /* bs >1 is not being tested yet! */
1830   Nbs       = mat->cmap->N/bs;
1831   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1832   c->N      = Nbs;
1833   c->m      = c->M;
1834   c->rstart = 0;
1835   c->brows  = 100;
1836 
1837   c->ncolors = nis;
1838   PetscCall(PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow));
1839   PetscCall(PetscMalloc1(csp->nz+1,&rows));
1840   PetscCall(PetscMalloc1(csp->nz+1,&den2sp));
1841 
1842   brows = c->brows;
1843   PetscCall(PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg));
1844   if (flg) c->brows = brows;
1845   if (brows > 0) {
1846     PetscCall(PetscMalloc1(nis+1,&c->lstart));
1847   }
1848 
1849   colorforrow[0] = 0;
1850   rows_i         = rows;
1851   den2sp_i       = den2sp;
1852 
1853   PetscCall(PetscMalloc1(nis+1,&colorforcol));
1854   PetscCall(PetscMalloc1(Nbs+1,&columns));
1855 
1856   colorforcol[0] = 0;
1857   columns_i      = columns;
1858 
1859   /* get column-wise storage of mat */
1860   PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL));
1861 
1862   cm   = c->m;
1863   PetscCall(PetscMalloc1(cm+1,&rowhit));
1864   PetscCall(PetscMalloc1(cm+1,&idxhit));
1865   for (i=0; i<nis; i++) { /* loop over color */
1866     PetscCall(ISGetLocalSize(isa[i],&n));
1867     PetscCall(ISGetIndices(isa[i],&is));
1868 
1869     c->ncolumns[i] = n;
1870     if (n) PetscCall(PetscArraycpy(columns_i,is,n));
1871     colorforcol[i+1] = colorforcol[i] + n;
1872     columns_i       += n;
1873 
1874     /* fast, crude version requires O(N*N) work */
1875     PetscCall(PetscArrayzero(rowhit,cm));
1876 
1877     for (j=0; j<n; j++) { /* loop over columns*/
1878       col     = is[j];
1879       row_idx = cj + ci[col];
1880       m       = ci[col+1] - ci[col];
1881       for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1882         idxhit[*row_idx]   = spidx[ci[col] + k];
1883         rowhit[*row_idx++] = col + 1;
1884       }
1885     }
1886     /* count the number of hits */
1887     nrows = 0;
1888     for (j=0; j<cm; j++) {
1889       if (rowhit[j]) nrows++;
1890     }
1891     c->nrows[i]      = nrows;
1892     colorforrow[i+1] = colorforrow[i] + nrows;
1893 
1894     nrows = 0;
1895     for (j=0; j<cm; j++) { /* loop over rows */
1896       if (rowhit[j]) {
1897         rows_i[nrows]   = j;
1898         den2sp_i[nrows] = idxhit[j];
1899         nrows++;
1900       }
1901     }
1902     den2sp_i += nrows;
1903 
1904     PetscCall(ISRestoreIndices(isa[i],&is));
1905     rows_i += nrows;
1906   }
1907   PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL));
1908   PetscCall(PetscFree(rowhit));
1909   PetscCall(ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa));
1910   PetscCheck(csp->nz == colorforrow[nis],PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT,csp->nz,colorforrow[nis]);
1911 
1912   c->colorforrow = colorforrow;
1913   c->rows        = rows;
1914   c->den2sp      = den2sp;
1915   c->colorforcol = colorforcol;
1916   c->columns     = columns;
1917 
1918   PetscCall(PetscFree(idxhit));
1919   PetscFunctionReturn(0);
1920 }
1921 
1922 /* --------------------------------------------------------------- */
1923 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1924 {
1925   Mat_Product    *product = C->product;
1926   Mat            A=product->A,B=product->B;
1927 
1928   PetscFunctionBegin;
1929   if (C->ops->mattransposemultnumeric) {
1930     /* Alg: "outerproduct" */
1931     PetscCall((*C->ops->mattransposemultnumeric)(A,B,C));
1932   } else {
1933     /* Alg: "matmatmult" -- C = At*B */
1934     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;
1935     Mat                 At;
1936 
1937     PetscCheck(atb,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1938     At = atb->At;
1939     if (atb->updateAt && At) { /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ() */
1940       PetscCall(MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At));
1941     }
1942     PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(At ? At : A,B,C));
1943     atb->updateAt = PETSC_TRUE;
1944   }
1945   PetscFunctionReturn(0);
1946 }
1947 
1948 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1949 {
1950   Mat_Product    *product = C->product;
1951   Mat            A=product->A,B=product->B;
1952   PetscReal      fill=product->fill;
1953 
1954   PetscFunctionBegin;
1955   PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C));
1956 
1957   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
1958   PetscFunctionReturn(0);
1959 }
1960 
1961 /* --------------------------------------------------------------- */
1962 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1963 {
1964   Mat_Product    *product = C->product;
1965   PetscInt       alg = 0; /* default algorithm */
1966   PetscBool      flg = PETSC_FALSE;
1967 #if !defined(PETSC_HAVE_HYPRE)
1968   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
1969   PetscInt       nalg = 7;
1970 #else
1971   const char     *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"};
1972   PetscInt       nalg = 8;
1973 #endif
1974 
1975   PetscFunctionBegin;
1976   /* Set default algorithm */
1977   PetscCall(PetscStrcmp(C->product->alg,"default",&flg));
1978   if (flg) {
1979     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
1980   }
1981 
1982   /* Get runtime option */
1983   if (product->api_user) {
1984     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");
1985     PetscCall(PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg));
1986     PetscOptionsEnd();
1987   } else {
1988     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");
1989     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg));
1990     PetscOptionsEnd();
1991   }
1992   if (flg) {
1993     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
1994   }
1995 
1996   C->ops->productsymbolic = MatProductSymbolic_AB;
1997   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
1998   PetscFunctionReturn(0);
1999 }
2000 
2001 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
2002 {
2003   Mat_Product    *product = C->product;
2004   PetscInt       alg = 0; /* default algorithm */
2005   PetscBool      flg = PETSC_FALSE;
2006   const char     *algTypes[3] = {"default","at*b","outerproduct"};
2007   PetscInt       nalg = 3;
2008 
2009   PetscFunctionBegin;
2010   /* Get runtime option */
2011   if (product->api_user) {
2012     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");
2013     PetscCall(PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg));
2014     PetscOptionsEnd();
2015   } else {
2016     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");
2017     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg));
2018     PetscOptionsEnd();
2019   }
2020   if (flg) {
2021     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2022   }
2023 
2024   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
2025   PetscFunctionReturn(0);
2026 }
2027 
2028 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
2029 {
2030   Mat_Product    *product = C->product;
2031   PetscInt       alg = 0; /* default algorithm */
2032   PetscBool      flg = PETSC_FALSE;
2033   const char     *algTypes[2] = {"default","color"};
2034   PetscInt       nalg = 2;
2035 
2036   PetscFunctionBegin;
2037   /* Set default algorithm */
2038   PetscCall(PetscStrcmp(C->product->alg,"default",&flg));
2039   if (!flg) {
2040     alg = 1;
2041     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2042   }
2043 
2044   /* Get runtime option */
2045   if (product->api_user) {
2046     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");
2047     PetscCall(PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg));
2048     PetscOptionsEnd();
2049   } else {
2050     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");
2051     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg));
2052     PetscOptionsEnd();
2053   }
2054   if (flg) {
2055     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2056   }
2057 
2058   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
2059   C->ops->productsymbolic          = MatProductSymbolic_ABt;
2060   PetscFunctionReturn(0);
2061 }
2062 
2063 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
2064 {
2065   Mat_Product    *product = C->product;
2066   PetscBool      flg = PETSC_FALSE;
2067   PetscInt       alg = 0; /* default algorithm -- alg=1 should be default!!! */
2068 #if !defined(PETSC_HAVE_HYPRE)
2069   const char     *algTypes[2] = {"scalable","rap"};
2070   PetscInt       nalg = 2;
2071 #else
2072   const char     *algTypes[3] = {"scalable","rap","hypre"};
2073   PetscInt       nalg = 3;
2074 #endif
2075 
2076   PetscFunctionBegin;
2077   /* Set default algorithm */
2078   PetscCall(PetscStrcmp(product->alg,"default",&flg));
2079   if (flg) {
2080     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2081   }
2082 
2083   /* Get runtime option */
2084   if (product->api_user) {
2085     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");
2086     PetscCall(PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg));
2087     PetscOptionsEnd();
2088   } else {
2089     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");
2090     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg));
2091     PetscOptionsEnd();
2092   }
2093   if (flg) {
2094     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2095   }
2096 
2097   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
2098   PetscFunctionReturn(0);
2099 }
2100 
2101 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2102 {
2103   Mat_Product    *product = C->product;
2104   PetscBool      flg = PETSC_FALSE;
2105   PetscInt       alg = 0; /* default algorithm */
2106   const char     *algTypes[3] = {"r*a*rt","r*art","coloring_rart"};
2107   PetscInt        nalg = 3;
2108 
2109   PetscFunctionBegin;
2110   /* Set default algorithm */
2111   PetscCall(PetscStrcmp(product->alg,"default",&flg));
2112   if (flg) {
2113     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2114   }
2115 
2116   /* Get runtime option */
2117   if (product->api_user) {
2118     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");
2119     PetscCall(PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg));
2120     PetscOptionsEnd();
2121   } else {
2122     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");
2123     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg));
2124     PetscOptionsEnd();
2125   }
2126   if (flg) {
2127     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2128   }
2129 
2130   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
2131   PetscFunctionReturn(0);
2132 }
2133 
2134 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
2135 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2136 {
2137   Mat_Product    *product = C->product;
2138   PetscInt       alg = 0; /* default algorithm */
2139   PetscBool      flg = PETSC_FALSE;
2140   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
2141   PetscInt       nalg = 7;
2142 
2143   PetscFunctionBegin;
2144   /* Set default algorithm */
2145   PetscCall(PetscStrcmp(product->alg,"default",&flg));
2146   if (flg) {
2147     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2148   }
2149 
2150   /* Get runtime option */
2151   if (product->api_user) {
2152     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");
2153     PetscCall(PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg));
2154     PetscOptionsEnd();
2155   } else {
2156     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");
2157     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg));
2158     PetscOptionsEnd();
2159   }
2160   if (flg) {
2161     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2162   }
2163 
2164   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
2165   C->ops->productsymbolic    = MatProductSymbolic_ABC;
2166   PetscFunctionReturn(0);
2167 }
2168 
2169 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2170 {
2171   Mat_Product    *product = C->product;
2172 
2173   PetscFunctionBegin;
2174   switch (product->type) {
2175   case MATPRODUCT_AB:
2176     PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C));
2177     break;
2178   case MATPRODUCT_AtB:
2179     PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C));
2180     break;
2181   case MATPRODUCT_ABt:
2182     PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C));
2183     break;
2184   case MATPRODUCT_PtAP:
2185     PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C));
2186     break;
2187   case MATPRODUCT_RARt:
2188     PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C));
2189     break;
2190   case MATPRODUCT_ABC:
2191     PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C));
2192     break;
2193   default:
2194     break;
2195   }
2196   PetscFunctionReturn(0);
2197 }
2198