xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 8fb5bd83c3955fefcf33a54e3bb66920a9fa884b)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4           C = A * B
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <petscbt.h>
10 #include <petsc/private/isimpl.h>
11 #include <../src/mat/impls/dense/seq/dense.h>
12 
13 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
14 {
15   PetscFunctionBegin;
16   if (C->ops->matmultnumeric) {
17     PetscCheck(C->ops->matmultnumeric != MatMatMultNumeric_SeqAIJ_SeqAIJ,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Recursive call");
18     PetscCall((*C->ops->matmultnumeric)(A,B,C));
19   } else {
20     PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C));
21   }
22   PetscFunctionReturn(0);
23 }
24 
25 /* Modified from MatCreateSeqAIJWithArrays() */
26 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat)
27 {
28   PetscInt       ii;
29   Mat_SeqAIJ     *aij;
30   PetscBool      isseqaij, osingle, ofree_a, ofree_ij;
31 
32   PetscFunctionBegin;
33   PetscCheck(m <= 0 || !i[0],PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0");
34   PetscCall(MatSetSizes(mat,m,n,m,n));
35 
36   if (!mtype) {
37     PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij));
38     if (!isseqaij) PetscCall(MatSetType(mat,MATSEQAIJ));
39   } else {
40     PetscCall(MatSetType(mat,mtype));
41   }
42 
43   aij  = (Mat_SeqAIJ*)(mat)->data;
44   osingle = aij->singlemalloc;
45   ofree_a = aij->free_a;
46   ofree_ij = aij->free_ij;
47   /* changes the free flags */
48   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,NULL));
49 
50   PetscCall(PetscFree(aij->ilen));
51   PetscCall(PetscFree(aij->imax));
52   PetscCall(PetscMalloc1(m,&aij->imax));
53   PetscCall(PetscMalloc1(m,&aij->ilen));
54   for (ii=0,aij->nonzerorowcnt=0,aij->rmax = 0; ii<m; ii++) {
55     const PetscInt rnz = i[ii+1] - i[ii];
56     aij->nonzerorowcnt += !!rnz;
57     aij->rmax = PetscMax(aij->rmax,rnz);
58     aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii];
59   }
60   aij->maxnz = i[m];
61   aij->nz = i[m];
62 
63   if (osingle) {
64     PetscCall(PetscFree3(aij->a,aij->j,aij->i));
65   } else {
66     if (ofree_a)  PetscCall(PetscFree(aij->a));
67     if (ofree_ij) PetscCall(PetscFree(aij->j));
68     if (ofree_ij) PetscCall(PetscFree(aij->i));
69   }
70   aij->i            = i;
71   aij->j            = j;
72   aij->a            = a;
73   aij->nonew        = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
74   /* default to not retain ownership */
75   aij->singlemalloc = PETSC_FALSE;
76   aij->free_a       = PETSC_FALSE;
77   aij->free_ij      = PETSC_FALSE;
78   PetscCall(MatCheckCompressedRow(mat,aij->nonzerorowcnt,&aij->compressedrow,aij->i,m,0.6));
79   PetscFunctionReturn(0);
80 }
81 
82 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
83 {
84   Mat_Product         *product = C->product;
85   MatProductAlgorithm alg;
86   PetscBool           flg;
87 
88   PetscFunctionBegin;
89   if (product) {
90     alg = product->alg;
91   } else {
92     alg = "sorted";
93   }
94   /* sorted */
95   PetscCall(PetscStrcmp(alg,"sorted",&flg));
96   if (flg) {
97     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C));
98     PetscFunctionReturn(0);
99   }
100 
101   /* scalable */
102   PetscCall(PetscStrcmp(alg,"scalable",&flg));
103   if (flg) {
104     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C));
105     PetscFunctionReturn(0);
106   }
107 
108   /* scalable_fast */
109   PetscCall(PetscStrcmp(alg,"scalable_fast",&flg));
110   if (flg) {
111     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C));
112     PetscFunctionReturn(0);
113   }
114 
115   /* heap */
116   PetscCall(PetscStrcmp(alg,"heap",&flg));
117   if (flg) {
118     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C));
119     PetscFunctionReturn(0);
120   }
121 
122   /* btheap */
123   PetscCall(PetscStrcmp(alg,"btheap",&flg));
124   if (flg) {
125     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C));
126     PetscFunctionReturn(0);
127   }
128 
129   /* llcondensed */
130   PetscCall(PetscStrcmp(alg,"llcondensed",&flg));
131   if (flg) {
132     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C));
133     PetscFunctionReturn(0);
134   }
135 
136   /* rowmerge */
137   PetscCall(PetscStrcmp(alg,"rowmerge",&flg));
138   if (flg) {
139     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C));
140     PetscFunctionReturn(0);
141   }
142 
143 #if defined(PETSC_HAVE_HYPRE)
144   PetscCall(PetscStrcmp(alg,"hypre",&flg));
145   if (flg) {
146     PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C));
147     PetscFunctionReturn(0);
148   }
149 #endif
150 
151   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
152 }
153 
154 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C)
155 {
156   Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
157   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
158   PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
159   PetscReal          afill;
160   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
161   PetscTable         ta;
162   PetscBT            lnkbt;
163   PetscFreeSpaceList free_space=NULL,current_space=NULL;
164 
165   PetscFunctionBegin;
166   /* Get ci and cj */
167   /*---------------*/
168   /* Allocate ci array, arrays for fill computation and */
169   /* free space for accumulating nonzero column info */
170   PetscCall(PetscMalloc1(am+2,&ci));
171   ci[0] = 0;
172 
173   /* create and initialize a linked list */
174   PetscCall(PetscTableCreate(bn,bn,&ta));
175   MatRowMergeMax_SeqAIJ(b,bm,ta);
176   PetscCall(PetscTableGetCount(ta,&Crmax));
177   PetscCall(PetscTableDestroy(&ta));
178 
179   PetscCall(PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt));
180 
181   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
182   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
183 
184   current_space = free_space;
185 
186   /* Determine ci and cj */
187   for (i=0; i<am; i++) {
188     anzi = ai[i+1] - ai[i];
189     aj   = a->j + ai[i];
190     for (j=0; j<anzi; j++) {
191       brow = aj[j];
192       bnzj = bi[brow+1] - bi[brow];
193       bj   = b->j + bi[brow];
194       /* add non-zero cols of B into the sorted linked list lnk */
195       PetscCall(PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt));
196     }
197     /* add possible missing diagonal entry */
198     if (C->force_diagonals) {
199       PetscCall(PetscLLCondensedAddSorted(1,&i,lnk,lnkbt));
200     }
201     cnzi = lnk[0];
202 
203     /* If free space is not available, make more free space */
204     /* Double the amount of total space in the list */
205     if (current_space->local_remaining<cnzi) {
206       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space));
207       ndouble++;
208     }
209 
210     /* Copy data into free space, then initialize lnk */
211     PetscCall(PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt));
212 
213     current_space->array           += cnzi;
214     current_space->local_used      += cnzi;
215     current_space->local_remaining -= cnzi;
216 
217     ci[i+1] = ci[i] + cnzi;
218   }
219 
220   /* Column indices are in the list of free space */
221   /* Allocate space for cj, initialize cj, and */
222   /* destroy list of free space and other temporary array(s) */
223   PetscCall(PetscMalloc1(ci[am]+1,&cj));
224   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
225   PetscCall(PetscLLCondensedDestroy(lnk,lnkbt));
226 
227   /* put together the new symbolic matrix */
228   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
229   PetscCall(MatSetBlockSizesFromMats(C,A,B));
230 
231   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
232   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
233   c          = (Mat_SeqAIJ*)(C->data);
234   c->free_a  = PETSC_FALSE;
235   c->free_ij = PETSC_TRUE;
236   c->nonew   = 0;
237 
238   /* fast, needs non-scalable O(bn) array 'abdense' */
239   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
240 
241   /* set MatInfo */
242   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
243   if (afill < 1.0) afill = 1.0;
244   C->info.mallocs           = ndouble;
245   C->info.fill_ratio_given  = fill;
246   C->info.fill_ratio_needed = afill;
247 
248 #if defined(PETSC_USE_INFO)
249   if (ci[am]) {
250     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
251     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
252   } else {
253     PetscCall(PetscInfo(C,"Empty matrix product\n"));
254   }
255 #endif
256   PetscFunctionReturn(0);
257 }
258 
259 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C)
260 {
261   PetscLogDouble    flops=0.0;
262   Mat_SeqAIJ        *a   = (Mat_SeqAIJ*)A->data;
263   Mat_SeqAIJ        *b   = (Mat_SeqAIJ*)B->data;
264   Mat_SeqAIJ        *c   = (Mat_SeqAIJ*)C->data;
265   PetscInt          *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
266   PetscInt          am   =A->rmap->n,cm=C->rmap->n;
267   PetscInt          i,j,k,anzi,bnzi,cnzi,brow;
268   PetscScalar       *ca,valtmp;
269   PetscScalar       *ab_dense;
270   PetscContainer    cab_dense;
271   const PetscScalar *aa,*ba,*baj;
272 
273   PetscFunctionBegin;
274   PetscCall(MatSeqAIJGetArrayRead(A,&aa));
275   PetscCall(MatSeqAIJGetArrayRead(B,&ba));
276   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
277     PetscCall(PetscMalloc1(ci[cm]+1,&ca));
278     c->a      = ca;
279     c->free_a = PETSC_TRUE;
280   } else ca = c->a;
281 
282   /* TODO this should be done in the symbolic phase */
283   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
284      that is hard to eradicate) */
285   PetscCall(PetscObjectQuery((PetscObject)C,"__PETSc__ab_dense",(PetscObject*)&cab_dense));
286   if (!cab_dense) {
287     PetscCall(PetscMalloc1(B->cmap->N,&ab_dense));
288     PetscCall(PetscContainerCreate(PETSC_COMM_SELF,&cab_dense));
289     PetscCall(PetscContainerSetPointer(cab_dense,ab_dense));
290     PetscCall(PetscContainerSetUserDestroy(cab_dense,PetscContainerUserDestroyDefault));
291     PetscCall(PetscObjectCompose((PetscObject)C,"__PETSc__ab_dense",(PetscObject)cab_dense));
292     PetscCall(PetscObjectDereference((PetscObject)cab_dense));
293   }
294   PetscCall(PetscContainerGetPointer(cab_dense,(void**)&ab_dense));
295   PetscCall(PetscArrayzero(ab_dense,B->cmap->N));
296 
297   /* clean old values in C */
298   PetscCall(PetscArrayzero(ca,ci[cm]));
299   /* Traverse A row-wise. */
300   /* Build the ith row in C by summing over nonzero columns in A, */
301   /* the rows of B corresponding to nonzeros of A. */
302   for (i=0; i<am; i++) {
303     anzi = ai[i+1] - ai[i];
304     for (j=0; j<anzi; j++) {
305       brow = aj[j];
306       bnzi = bi[brow+1] - bi[brow];
307       bjj  = bj + bi[brow];
308       baj  = ba + bi[brow];
309       /* perform dense axpy */
310       valtmp = aa[j];
311       for (k=0; k<bnzi; k++) {
312         ab_dense[bjj[k]] += valtmp*baj[k];
313       }
314       flops += 2*bnzi;
315     }
316     aj += anzi; aa += anzi;
317 
318     cnzi = ci[i+1] - ci[i];
319     for (k=0; k<cnzi; k++) {
320       ca[k]          += ab_dense[cj[k]];
321       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
322     }
323     flops += cnzi;
324     cj    += cnzi; ca += cnzi;
325   }
326 #if defined(PETSC_HAVE_DEVICE)
327   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
328 #endif
329   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
330   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
331   PetscCall(PetscLogFlops(flops));
332   PetscCall(MatSeqAIJRestoreArrayRead(A,&aa));
333   PetscCall(MatSeqAIJRestoreArrayRead(B,&ba));
334   PetscFunctionReturn(0);
335 }
336 
337 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
338 {
339   PetscLogDouble    flops=0.0;
340   Mat_SeqAIJ        *a   = (Mat_SeqAIJ*)A->data;
341   Mat_SeqAIJ        *b   = (Mat_SeqAIJ*)B->data;
342   Mat_SeqAIJ        *c   = (Mat_SeqAIJ*)C->data;
343   PetscInt          *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
344   PetscInt          am   = A->rmap->N,cm=C->rmap->N;
345   PetscInt          i,j,k,anzi,bnzi,cnzi,brow;
346   PetscScalar       *ca=c->a,valtmp;
347   const PetscScalar *aa,*ba,*baj;
348   PetscInt          nextb;
349 
350   PetscFunctionBegin;
351   PetscCall(MatSeqAIJGetArrayRead(A,&aa));
352   PetscCall(MatSeqAIJGetArrayRead(B,&ba));
353   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
354     PetscCall(PetscMalloc1(ci[cm]+1,&ca));
355     c->a      = ca;
356     c->free_a = PETSC_TRUE;
357   }
358 
359   /* clean old values in C */
360   PetscCall(PetscArrayzero(ca,ci[cm]));
361   /* Traverse A row-wise. */
362   /* Build the ith row in C by summing over nonzero columns in A, */
363   /* the rows of B corresponding to nonzeros of A. */
364   for (i=0; i<am; i++) {
365     anzi = ai[i+1] - ai[i];
366     cnzi = ci[i+1] - ci[i];
367     for (j=0; j<anzi; j++) {
368       brow = aj[j];
369       bnzi = bi[brow+1] - bi[brow];
370       bjj  = bj + bi[brow];
371       baj  = ba + bi[brow];
372       /* perform sparse axpy */
373       valtmp = aa[j];
374       nextb  = 0;
375       for (k=0; nextb<bnzi; k++) {
376         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
377           ca[k] += valtmp*baj[nextb++];
378         }
379       }
380       flops += 2*bnzi;
381     }
382     aj += anzi; aa += anzi;
383     cj += cnzi; ca += cnzi;
384   }
385 #if defined(PETSC_HAVE_DEVICE)
386   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
387 #endif
388   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
389   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
390   PetscCall(PetscLogFlops(flops));
391   PetscCall(MatSeqAIJRestoreArrayRead(A,&aa));
392   PetscCall(MatSeqAIJRestoreArrayRead(B,&ba));
393   PetscFunctionReturn(0);
394 }
395 
396 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C)
397 {
398   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
399   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
400   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
401   MatScalar          *ca;
402   PetscReal          afill;
403   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
404   PetscTable         ta;
405   PetscFreeSpaceList free_space=NULL,current_space=NULL;
406 
407   PetscFunctionBegin;
408   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
409   /*-----------------------------------------------------------------------------------------*/
410   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
411   PetscCall(PetscMalloc1(am+2,&ci));
412   ci[0] = 0;
413 
414   /* create and initialize a linked list */
415   PetscCall(PetscTableCreate(bn,bn,&ta));
416   MatRowMergeMax_SeqAIJ(b,bm,ta);
417   PetscCall(PetscTableGetCount(ta,&Crmax));
418   PetscCall(PetscTableDestroy(&ta));
419 
420   PetscCall(PetscLLCondensedCreate_fast(Crmax,&lnk));
421 
422   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
423   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
424   current_space = free_space;
425 
426   /* Determine ci and cj */
427   for (i=0; i<am; i++) {
428     anzi = ai[i+1] - ai[i];
429     aj   = a->j + ai[i];
430     for (j=0; j<anzi; j++) {
431       brow = aj[j];
432       bnzj = bi[brow+1] - bi[brow];
433       bj   = b->j + bi[brow];
434       /* add non-zero cols of B into the sorted linked list lnk */
435       PetscCall(PetscLLCondensedAddSorted_fast(bnzj,bj,lnk));
436     }
437     /* add possible missing diagonal entry */
438     if (C->force_diagonals) {
439       PetscCall(PetscLLCondensedAddSorted_fast(1,&i,lnk));
440     }
441     cnzi = lnk[1];
442 
443     /* If free space is not available, make more free space */
444     /* Double the amount of total space in the list */
445     if (current_space->local_remaining<cnzi) {
446       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space));
447       ndouble++;
448     }
449 
450     /* Copy data into free space, then initialize lnk */
451     PetscCall(PetscLLCondensedClean_fast(cnzi,current_space->array,lnk));
452 
453     current_space->array           += cnzi;
454     current_space->local_used      += cnzi;
455     current_space->local_remaining -= cnzi;
456 
457     ci[i+1] = ci[i] + cnzi;
458   }
459 
460   /* Column indices are in the list of free space */
461   /* Allocate space for cj, initialize cj, and */
462   /* destroy list of free space and other temporary array(s) */
463   PetscCall(PetscMalloc1(ci[am]+1,&cj));
464   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
465   PetscCall(PetscLLCondensedDestroy_fast(lnk));
466 
467   /* Allocate space for ca */
468   PetscCall(PetscCalloc1(ci[am]+1,&ca));
469 
470   /* put together the new symbolic matrix */
471   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C));
472   PetscCall(MatSetBlockSizesFromMats(C,A,B));
473 
474   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
475   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
476   c          = (Mat_SeqAIJ*)(C->data);
477   c->free_a  = PETSC_TRUE;
478   c->free_ij = PETSC_TRUE;
479   c->nonew   = 0;
480 
481   /* slower, less memory */
482   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
483 
484   /* set MatInfo */
485   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
486   if (afill < 1.0) afill = 1.0;
487   C->info.mallocs           = ndouble;
488   C->info.fill_ratio_given  = fill;
489   C->info.fill_ratio_needed = afill;
490 
491 #if defined(PETSC_USE_INFO)
492   if (ci[am]) {
493     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
494     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
495   } else {
496     PetscCall(PetscInfo(C,"Empty matrix product\n"));
497   }
498 #endif
499   PetscFunctionReturn(0);
500 }
501 
502 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C)
503 {
504   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
505   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
506   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
507   MatScalar          *ca;
508   PetscReal          afill;
509   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
510   PetscTable         ta;
511   PetscFreeSpaceList free_space=NULL,current_space=NULL;
512 
513   PetscFunctionBegin;
514   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
515   /*---------------------------------------------------------------------------------------------*/
516   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
517   PetscCall(PetscMalloc1(am+2,&ci));
518   ci[0] = 0;
519 
520   /* create and initialize a linked list */
521   PetscCall(PetscTableCreate(bn,bn,&ta));
522   MatRowMergeMax_SeqAIJ(b,bm,ta);
523   PetscCall(PetscTableGetCount(ta,&Crmax));
524   PetscCall(PetscTableDestroy(&ta));
525   PetscCall(PetscLLCondensedCreate_Scalable(Crmax,&lnk));
526 
527   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
528   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
529   current_space = free_space;
530 
531   /* Determine ci and cj */
532   for (i=0; i<am; i++) {
533     anzi = ai[i+1] - ai[i];
534     aj   = a->j + ai[i];
535     for (j=0; j<anzi; j++) {
536       brow = aj[j];
537       bnzj = bi[brow+1] - bi[brow];
538       bj   = b->j + bi[brow];
539       /* add non-zero cols of B into the sorted linked list lnk */
540       PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk));
541     }
542     /* add possible missing diagonal entry */
543     if (C->force_diagonals) {
544       PetscCall(PetscLLCondensedAddSorted_Scalable(1,&i,lnk));
545     }
546 
547     cnzi = lnk[0];
548 
549     /* If free space is not available, make more free space */
550     /* Double the amount of total space in the list */
551     if (current_space->local_remaining<cnzi) {
552       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space));
553       ndouble++;
554     }
555 
556     /* Copy data into free space, then initialize lnk */
557     PetscCall(PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk));
558 
559     current_space->array           += cnzi;
560     current_space->local_used      += cnzi;
561     current_space->local_remaining -= cnzi;
562 
563     ci[i+1] = ci[i] + cnzi;
564   }
565 
566   /* Column indices are in the list of free space */
567   /* Allocate space for cj, initialize cj, and */
568   /* destroy list of free space and other temporary array(s) */
569   PetscCall(PetscMalloc1(ci[am]+1,&cj));
570   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
571   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
572 
573   /* Allocate space for ca */
574   /*-----------------------*/
575   PetscCall(PetscCalloc1(ci[am]+1,&ca));
576 
577   /* put together the new symbolic matrix */
578   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C));
579   PetscCall(MatSetBlockSizesFromMats(C,A,B));
580 
581   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
582   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
583   c          = (Mat_SeqAIJ*)(C->data);
584   c->free_a  = PETSC_TRUE;
585   c->free_ij = PETSC_TRUE;
586   c->nonew   = 0;
587 
588   /* slower, less memory */
589   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
590 
591   /* set MatInfo */
592   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
593   if (afill < 1.0) afill = 1.0;
594   C->info.mallocs           = ndouble;
595   C->info.fill_ratio_given  = fill;
596   C->info.fill_ratio_needed = afill;
597 
598 #if defined(PETSC_USE_INFO)
599   if (ci[am]) {
600     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
601     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
602   } else {
603     PetscCall(PetscInfo(C,"Empty matrix product\n"));
604   }
605 #endif
606   PetscFunctionReturn(0);
607 }
608 
609 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C)
610 {
611   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
612   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
613   PetscInt           *ci,*cj,*bb;
614   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
615   PetscReal          afill;
616   PetscInt           i,j,col,ndouble = 0;
617   PetscFreeSpaceList free_space=NULL,current_space=NULL;
618   PetscHeap          h;
619 
620   PetscFunctionBegin;
621   /* Get ci and cj - by merging sorted rows using a heap */
622   /*---------------------------------------------------------------------------------------------*/
623   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
624   PetscCall(PetscMalloc1(am+2,&ci));
625   ci[0] = 0;
626 
627   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
628   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
629   current_space = free_space;
630 
631   PetscCall(PetscHeapCreate(a->rmax,&h));
632   PetscCall(PetscMalloc1(a->rmax,&bb));
633 
634   /* Determine ci and cj */
635   for (i=0; i<am; i++) {
636     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
637     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
638     ci[i+1] = ci[i];
639     /* Populate the min heap */
640     for (j=0; j<anzi; j++) {
641       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
642       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
643         PetscCall(PetscHeapAdd(h,j,bj[bb[j]++]));
644       }
645     }
646     /* Pick off the min element, adding it to free space */
647     PetscCall(PetscHeapPop(h,&j,&col));
648     while (j >= 0) {
649       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
650         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space));
651         ndouble++;
652       }
653       *(current_space->array++) = col;
654       current_space->local_used++;
655       current_space->local_remaining--;
656       ci[i+1]++;
657 
658       /* stash if anything else remains in this row of B */
659       if (bb[j] < bi[acol[j]+1]) PetscCall(PetscHeapStash(h,j,bj[bb[j]++]));
660       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
661         PetscInt j2,col2;
662         PetscCall(PetscHeapPeek(h,&j2,&col2));
663         if (col2 != col) break;
664         PetscCall(PetscHeapPop(h,&j2,&col2));
665         if (bb[j2] < bi[acol[j2]+1]) PetscCall(PetscHeapStash(h,j2,bj[bb[j2]++]));
666       }
667       /* Put any stashed elements back into the min heap */
668       PetscCall(PetscHeapUnstash(h));
669       PetscCall(PetscHeapPop(h,&j,&col));
670     }
671   }
672   PetscCall(PetscFree(bb));
673   PetscCall(PetscHeapDestroy(&h));
674 
675   /* Column indices are in the list of free space */
676   /* Allocate space for cj, initialize cj, and */
677   /* destroy list of free space and other temporary array(s) */
678   PetscCall(PetscMalloc1(ci[am],&cj));
679   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
680 
681   /* put together the new symbolic matrix */
682   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
683   PetscCall(MatSetBlockSizesFromMats(C,A,B));
684 
685   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
686   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
687   c          = (Mat_SeqAIJ*)(C->data);
688   c->free_a  = PETSC_TRUE;
689   c->free_ij = PETSC_TRUE;
690   c->nonew   = 0;
691 
692   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
693 
694   /* set MatInfo */
695   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
696   if (afill < 1.0) afill = 1.0;
697   C->info.mallocs           = ndouble;
698   C->info.fill_ratio_given  = fill;
699   C->info.fill_ratio_needed = afill;
700 
701 #if defined(PETSC_USE_INFO)
702   if (ci[am]) {
703     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
704     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
705   } else {
706     PetscCall(PetscInfo(C,"Empty matrix product\n"));
707   }
708 #endif
709   PetscFunctionReturn(0);
710 }
711 
712 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C)
713 {
714   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
715   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
716   PetscInt           *ci,*cj,*bb;
717   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
718   PetscReal          afill;
719   PetscInt           i,j,col,ndouble = 0;
720   PetscFreeSpaceList free_space=NULL,current_space=NULL;
721   PetscHeap          h;
722   PetscBT            bt;
723 
724   PetscFunctionBegin;
725   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
726   /*---------------------------------------------------------------------------------------------*/
727   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
728   PetscCall(PetscMalloc1(am+2,&ci));
729   ci[0] = 0;
730 
731   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
732   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
733 
734   current_space = free_space;
735 
736   PetscCall(PetscHeapCreate(a->rmax,&h));
737   PetscCall(PetscMalloc1(a->rmax,&bb));
738   PetscCall(PetscBTCreate(bn,&bt));
739 
740   /* Determine ci and cj */
741   for (i=0; i<am; i++) {
742     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
743     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
744     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
745     ci[i+1] = ci[i];
746     /* Populate the min heap */
747     for (j=0; j<anzi; j++) {
748       PetscInt brow = acol[j];
749       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
750         PetscInt bcol = bj[bb[j]];
751         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
752           PetscCall(PetscHeapAdd(h,j,bcol));
753           bb[j]++;
754           break;
755         }
756       }
757     }
758     /* Pick off the min element, adding it to free space */
759     PetscCall(PetscHeapPop(h,&j,&col));
760     while (j >= 0) {
761       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
762         fptr = NULL;                      /* need PetscBTMemzero */
763         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space));
764         ndouble++;
765       }
766       *(current_space->array++) = col;
767       current_space->local_used++;
768       current_space->local_remaining--;
769       ci[i+1]++;
770 
771       /* stash if anything else remains in this row of B */
772       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
773         PetscInt bcol = bj[bb[j]];
774         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
775           PetscCall(PetscHeapAdd(h,j,bcol));
776           bb[j]++;
777           break;
778         }
779       }
780       PetscCall(PetscHeapPop(h,&j,&col));
781     }
782     if (fptr) {                 /* Clear the bits for this row */
783       for (; fptr<current_space->array; fptr++) PetscCall(PetscBTClear(bt,*fptr));
784     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
785       PetscCall(PetscBTMemzero(bn,bt));
786     }
787   }
788   PetscCall(PetscFree(bb));
789   PetscCall(PetscHeapDestroy(&h));
790   PetscCall(PetscBTDestroy(&bt));
791 
792   /* Column indices are in the list of free space */
793   /* Allocate space for cj, initialize cj, and */
794   /* destroy list of free space and other temporary array(s) */
795   PetscCall(PetscMalloc1(ci[am],&cj));
796   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
797 
798   /* put together the new symbolic matrix */
799   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
800   PetscCall(MatSetBlockSizesFromMats(C,A,B));
801 
802   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
803   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
804   c          = (Mat_SeqAIJ*)(C->data);
805   c->free_a  = PETSC_TRUE;
806   c->free_ij = PETSC_TRUE;
807   c->nonew   = 0;
808 
809   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
810 
811   /* set MatInfo */
812   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
813   if (afill < 1.0) afill = 1.0;
814   C->info.mallocs           = ndouble;
815   C->info.fill_ratio_given  = fill;
816   C->info.fill_ratio_needed = afill;
817 
818 #if defined(PETSC_USE_INFO)
819   if (ci[am]) {
820     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
821     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
822   } else {
823     PetscCall(PetscInfo(C,"Empty matrix product\n"));
824   }
825 #endif
826   PetscFunctionReturn(0);
827 }
828 
829 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C)
830 {
831   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
832   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1;
833   PetscInt           *ci,*cj,*outputj,worki_L1[9],worki_L2[9];
834   PetscInt           c_maxmem,a_maxrownnz=0,a_rownnz;
835   const PetscInt     workcol[8]={0,1,2,3,4,5,6,7};
836   const PetscInt     am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
837   const PetscInt     *brow_ptr[8],*brow_end[8];
838   PetscInt           window[8];
839   PetscInt           window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows;
840   PetscInt           i,k,ndouble=0,L1_rowsleft,rowsleft;
841   PetscReal          afill;
842   PetscInt           *workj_L1,*workj_L2,*workj_L3;
843   PetscInt           L1_nnz,L2_nnz;
844 
845   /* Step 1: Get upper bound on memory required for allocation.
846              Because of the way virtual memory works,
847              only the memory pages that are actually needed will be physically allocated. */
848   PetscFunctionBegin;
849   PetscCall(PetscMalloc1(am+1,&ci));
850   for (i=0; i<am; i++) {
851     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
852     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
853     a_rownnz = 0;
854     for (k=0; k<anzi; ++k) {
855       a_rownnz += bi[acol[k]+1] - bi[acol[k]];
856       if (a_rownnz > bn) {
857         a_rownnz = bn;
858         break;
859       }
860     }
861     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
862   }
863   /* temporary work areas for merging rows */
864   PetscCall(PetscMalloc1(a_maxrownnz*8,&workj_L1));
865   PetscCall(PetscMalloc1(a_maxrownnz*8,&workj_L2));
866   PetscCall(PetscMalloc1(a_maxrownnz,&workj_L3));
867 
868   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
869   c_maxmem = 8*(ai[am]+bi[bm]);
870   /* Step 2: Populate pattern for C */
871   PetscCall(PetscMalloc1(c_maxmem,&cj));
872 
873   ci_nnz       = 0;
874   ci[0]        = 0;
875   worki_L1[0]  = 0;
876   worki_L2[0]  = 0;
877   for (i=0; i<am; i++) {
878     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
879     const PetscInt *acol = aj + ai[i];      /* column indices of nonzero entries in this row */
880     rowsleft             = anzi;
881     inputcol_L1          = acol;
882     L2_nnz               = 0;
883     L2_nrows             = 1;  /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
884     worki_L2[1]          = 0;
885     outputi_nnz          = 0;
886 
887     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
888     while (ci_nnz+a_maxrownnz > c_maxmem) {
889       c_maxmem *= 2;
890       ndouble++;
891       PetscCall(PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj));
892     }
893 
894     while (rowsleft) {
895       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
896       L1_nrows    = 0;
897       L1_nnz      = 0;
898       inputcol    = inputcol_L1;
899       inputi      = bi;
900       inputj      = bj;
901 
902       /* The following macro is used to specialize for small rows in A.
903          This helps with compiler unrolling, improving performance substantially.
904           Input:  inputj   inputi  inputcol  bn
905           Output: outputj  outputi_nnz                       */
906        #define MatMatMultSymbolic_RowMergeMacro(ANNZ)                        \
907          window_min  = bn;                                                   \
908          outputi_nnz = 0;                                                    \
909          for (k=0; k<ANNZ; ++k) {                                            \
910            brow_ptr[k] = inputj + inputi[inputcol[k]];                       \
911            brow_end[k] = inputj + inputi[inputcol[k]+1];                     \
912            window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn;   \
913            window_min  = PetscMin(window[k], window_min);                    \
914          }                                                                   \
915          while (window_min < bn) {                                           \
916            outputj[outputi_nnz++] = window_min;                              \
917            /* advance front and compute new minimum */                       \
918            old_window_min = window_min;                                      \
919            window_min = bn;                                                  \
920            for (k=0; k<ANNZ; ++k) {                                          \
921              if (window[k] == old_window_min) {                              \
922                brow_ptr[k]++;                                                \
923                window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
924              }                                                               \
925              window_min = PetscMin(window[k], window_min);                   \
926            }                                                                 \
927          }
928 
929       /************** L E V E L  1 ***************/
930       /* Merge up to 8 rows of B to L1 work array*/
931       while (L1_rowsleft) {
932         outputi_nnz = 0;
933         if (anzi > 8)  outputj = workj_L1 + L1_nnz;     /* Level 1 rowmerge*/
934         else           outputj = cj + ci_nnz;           /* Merge directly to C */
935 
936         switch (L1_rowsleft) {
937         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
938                  brow_end[0] = inputj + inputi[inputcol[0]+1];
939                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
940                  inputcol    += L1_rowsleft;
941                  rowsleft    -= L1_rowsleft;
942                  L1_rowsleft  = 0;
943                  break;
944         case 2:  MatMatMultSymbolic_RowMergeMacro(2);
945                  inputcol    += L1_rowsleft;
946                  rowsleft    -= L1_rowsleft;
947                  L1_rowsleft  = 0;
948                  break;
949         case 3: MatMatMultSymbolic_RowMergeMacro(3);
950                  inputcol    += L1_rowsleft;
951                  rowsleft    -= L1_rowsleft;
952                  L1_rowsleft  = 0;
953                  break;
954         case 4:  MatMatMultSymbolic_RowMergeMacro(4);
955                  inputcol    += L1_rowsleft;
956                  rowsleft    -= L1_rowsleft;
957                  L1_rowsleft  = 0;
958                  break;
959         case 5:  MatMatMultSymbolic_RowMergeMacro(5);
960                  inputcol    += L1_rowsleft;
961                  rowsleft    -= L1_rowsleft;
962                  L1_rowsleft  = 0;
963                  break;
964         case 6:  MatMatMultSymbolic_RowMergeMacro(6);
965                  inputcol    += L1_rowsleft;
966                  rowsleft    -= L1_rowsleft;
967                  L1_rowsleft  = 0;
968                  break;
969         case 7:  MatMatMultSymbolic_RowMergeMacro(7);
970                  inputcol    += L1_rowsleft;
971                  rowsleft    -= L1_rowsleft;
972                  L1_rowsleft  = 0;
973                  break;
974         default: MatMatMultSymbolic_RowMergeMacro(8);
975                  inputcol    += 8;
976                  rowsleft    -= 8;
977                  L1_rowsleft -= 8;
978                  break;
979         }
980         inputcol_L1           = inputcol;
981         L1_nnz               += outputi_nnz;
982         worki_L1[++L1_nrows]  = L1_nnz;
983       }
984 
985       /********************** L E V E L  2 ************************/
986       /* Merge from L1 work array to either C or to L2 work array */
987       if (anzi > 8) {
988         inputi      = worki_L1;
989         inputj      = workj_L1;
990         inputcol    = workcol;
991         outputi_nnz = 0;
992 
993         if (anzi <= 64) outputj = cj + ci_nnz;        /* Merge from L1 work array to C */
994         else            outputj = workj_L2 + L2_nnz;  /* Merge from L1 work array to L2 work array */
995 
996         switch (L1_nrows) {
997         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
998                  brow_end[0] = inputj + inputi[inputcol[0]+1];
999                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1000                  break;
1001         case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
1002         case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
1003         case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1004         case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1005         case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1006         case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1007         case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1008         default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
1009         }
1010         L2_nnz               += outputi_nnz;
1011         worki_L2[++L2_nrows]  = L2_nnz;
1012 
1013         /************************ L E V E L  3 **********************/
1014         /* Merge from L2 work array to either C or to L2 work array */
1015         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1016           inputi      = worki_L2;
1017           inputj      = workj_L2;
1018           inputcol    = workcol;
1019           outputi_nnz = 0;
1020           if (rowsleft) outputj = workj_L3;
1021           else          outputj = cj + ci_nnz;
1022           switch (L2_nrows) {
1023           case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
1024                    brow_end[0] = inputj + inputi[inputcol[0]+1];
1025                    for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1026                    break;
1027           case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
1028           case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
1029           case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1030           case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1031           case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1032           case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1033           case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1034           default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1035           }
1036           L2_nrows    = 1;
1037           L2_nnz      = outputi_nnz;
1038           worki_L2[1] = outputi_nnz;
1039           /* Copy to workj_L2 */
1040           if (rowsleft) {
1041             for (k=0; k<outputi_nnz; ++k)  workj_L2[k] = outputj[k];
1042           }
1043         }
1044       }
1045     }  /* while (rowsleft) */
1046 #undef MatMatMultSymbolic_RowMergeMacro
1047 
1048     /* terminate current row */
1049     ci_nnz += outputi_nnz;
1050     ci[i+1] = ci_nnz;
1051   }
1052 
1053   /* Step 3: Create the new symbolic matrix */
1054   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
1055   PetscCall(MatSetBlockSizesFromMats(C,A,B));
1056 
1057   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1058   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1059   c          = (Mat_SeqAIJ*)(C->data);
1060   c->free_a  = PETSC_TRUE;
1061   c->free_ij = PETSC_TRUE;
1062   c->nonew   = 0;
1063 
1064   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1065 
1066   /* set MatInfo */
1067   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1068   if (afill < 1.0) afill = 1.0;
1069   C->info.mallocs           = ndouble;
1070   C->info.fill_ratio_given  = fill;
1071   C->info.fill_ratio_needed = afill;
1072 
1073 #if defined(PETSC_USE_INFO)
1074   if (ci[am]) {
1075     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
1076     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
1077   } else {
1078     PetscCall(PetscInfo(C,"Empty matrix product\n"));
1079   }
1080 #endif
1081 
1082   /* Step 4: Free temporary work areas */
1083   PetscCall(PetscFree(workj_L1));
1084   PetscCall(PetscFree(workj_L2));
1085   PetscCall(PetscFree(workj_L3));
1086   PetscFunctionReturn(0);
1087 }
1088 
1089 /* concatenate unique entries and then sort */
1090 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C)
1091 {
1092   Mat_SeqAIJ     *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1093   const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
1094   PetscInt       *ci,*cj,bcol;
1095   PetscInt       am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1096   PetscReal      afill;
1097   PetscInt       i,j,ndouble = 0;
1098   PetscSegBuffer seg,segrow;
1099   char           *seen;
1100 
1101   PetscFunctionBegin;
1102   PetscCall(PetscMalloc1(am+1,&ci));
1103   ci[0] = 0;
1104 
1105   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1106   PetscCall(PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg));
1107   PetscCall(PetscSegBufferCreate(sizeof(PetscInt),100,&segrow));
1108   PetscCall(PetscCalloc1(bn,&seen));
1109 
1110   /* Determine ci and cj */
1111   for (i=0; i<am; i++) {
1112     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1113     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1114     PetscInt packlen = 0,*PETSC_RESTRICT crow;
1115 
1116     /* Pack segrow */
1117     for (j=0; j<anzi; j++) {
1118       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
1119       for (k=bjstart; k<bjend; k++) {
1120         bcol = bj[k];
1121         if (!seen[bcol]) { /* new entry */
1122           PetscInt *PETSC_RESTRICT slot;
1123           PetscCall(PetscSegBufferGetInts(segrow,1,&slot));
1124           *slot = bcol;
1125           seen[bcol] = 1;
1126           packlen++;
1127         }
1128       }
1129     }
1130 
1131     /* Check i-th diagonal entry */
1132     if (C->force_diagonals && !seen[i]) {
1133       PetscInt *PETSC_RESTRICT slot;
1134       PetscCall(PetscSegBufferGetInts(segrow,1,&slot));
1135       *slot   = i;
1136       seen[i] = 1;
1137       packlen++;
1138     }
1139 
1140     PetscCall(PetscSegBufferGetInts(seg,packlen,&crow));
1141     PetscCall(PetscSegBufferExtractTo(segrow,crow));
1142     PetscCall(PetscSortInt(packlen,crow));
1143     ci[i+1] = ci[i] + packlen;
1144     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
1145   }
1146   PetscCall(PetscSegBufferDestroy(&segrow));
1147   PetscCall(PetscFree(seen));
1148 
1149   /* Column indices are in the segmented buffer */
1150   PetscCall(PetscSegBufferExtractAlloc(seg,&cj));
1151   PetscCall(PetscSegBufferDestroy(&seg));
1152 
1153   /* put together the new symbolic matrix */
1154   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
1155   PetscCall(MatSetBlockSizesFromMats(C,A,B));
1156 
1157   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1158   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1159   c          = (Mat_SeqAIJ*)(C->data);
1160   c->free_a  = PETSC_TRUE;
1161   c->free_ij = PETSC_TRUE;
1162   c->nonew   = 0;
1163 
1164   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1165 
1166   /* set MatInfo */
1167   afill = (PetscReal)ci[am]/PetscMax(ai[am]+bi[bm],1) + 1.e-5;
1168   if (afill < 1.0) afill = 1.0;
1169   C->info.mallocs           = ndouble;
1170   C->info.fill_ratio_given  = fill;
1171   C->info.fill_ratio_needed = afill;
1172 
1173 #if defined(PETSC_USE_INFO)
1174   if (ci[am]) {
1175     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
1176     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
1177   } else {
1178     PetscCall(PetscInfo(C,"Empty matrix product\n"));
1179   }
1180 #endif
1181   PetscFunctionReturn(0);
1182 }
1183 
1184 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
1185 {
1186   Mat_MatMatTransMult *abt=(Mat_MatMatTransMult *)data;
1187 
1188   PetscFunctionBegin;
1189   PetscCall(MatTransposeColoringDestroy(&abt->matcoloring));
1190   PetscCall(MatDestroy(&abt->Bt_den));
1191   PetscCall(MatDestroy(&abt->ABt_den));
1192   PetscCall(PetscFree(abt));
1193   PetscFunctionReturn(0);
1194 }
1195 
1196 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1197 {
1198   Mat                 Bt;
1199   PetscInt            *bti,*btj;
1200   Mat_MatMatTransMult *abt;
1201   Mat_Product         *product = C->product;
1202   char                *alg;
1203 
1204   PetscFunctionBegin;
1205   PetscCheck(product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1206   PetscCheck(!product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
1207 
1208   /* create symbolic Bt */
1209   PetscCall(MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj));
1210   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt));
1211   PetscCall(MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs)));
1212   PetscCall(MatSetType(Bt,((PetscObject)A)->type_name));
1213 
1214   /* get symbolic C=A*Bt */
1215   PetscCall(PetscStrallocpy(product->alg,&alg));
1216   PetscCall(MatProductSetAlgorithm(C,"sorted")); /* set algorithm for C = A*Bt */
1217   PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C));
1218   PetscCall(MatProductSetAlgorithm(C,alg)); /* resume original algorithm for ABt product */
1219   PetscCall(PetscFree(alg));
1220 
1221   /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
1222   PetscCall(PetscNew(&abt));
1223 
1224   product->data    = abt;
1225   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
1226 
1227   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
1228 
1229   abt->usecoloring = PETSC_FALSE;
1230   PetscCall(PetscStrcmp(product->alg,"color",&abt->usecoloring));
1231   if (abt->usecoloring) {
1232     /* Create MatTransposeColoring from symbolic C=A*B^T */
1233     MatTransposeColoring matcoloring;
1234     MatColoring          coloring;
1235     ISColoring           iscoloring;
1236     Mat                  Bt_dense,C_dense;
1237 
1238     /* inode causes memory problem */
1239     PetscCall(MatSetOption(C,MAT_USE_INODES,PETSC_FALSE));
1240 
1241     PetscCall(MatColoringCreate(C,&coloring));
1242     PetscCall(MatColoringSetDistance(coloring,2));
1243     PetscCall(MatColoringSetType(coloring,MATCOLORINGSL));
1244     PetscCall(MatColoringSetFromOptions(coloring));
1245     PetscCall(MatColoringApply(coloring,&iscoloring));
1246     PetscCall(MatColoringDestroy(&coloring));
1247     PetscCall(MatTransposeColoringCreate(C,iscoloring,&matcoloring));
1248 
1249     abt->matcoloring = matcoloring;
1250 
1251     PetscCall(ISColoringDestroy(&iscoloring));
1252 
1253     /* Create Bt_dense and C_dense = A*Bt_dense */
1254     PetscCall(MatCreate(PETSC_COMM_SELF,&Bt_dense));
1255     PetscCall(MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors));
1256     PetscCall(MatSetType(Bt_dense,MATSEQDENSE));
1257     PetscCall(MatSeqDenseSetPreallocation(Bt_dense,NULL));
1258 
1259     Bt_dense->assembled = PETSC_TRUE;
1260     abt->Bt_den         = Bt_dense;
1261 
1262     PetscCall(MatCreate(PETSC_COMM_SELF,&C_dense));
1263     PetscCall(MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors));
1264     PetscCall(MatSetType(C_dense,MATSEQDENSE));
1265     PetscCall(MatSeqDenseSetPreallocation(C_dense,NULL));
1266 
1267     Bt_dense->assembled = PETSC_TRUE;
1268     abt->ABt_den  = C_dense;
1269 
1270 #if defined(PETSC_USE_INFO)
1271     {
1272       Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
1273       PetscCall(PetscInfo(C,"Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(double)(((PetscReal)(c->nz))/((PetscReal)(A->rmap->n*matcoloring->ncolors)))));
1274     }
1275 #endif
1276   }
1277   /* clean up */
1278   PetscCall(MatDestroy(&Bt));
1279   PetscCall(MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj));
1280   PetscFunctionReturn(0);
1281 }
1282 
1283 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1284 {
1285   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1286   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
1287   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
1288   PetscLogDouble      flops=0.0;
1289   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
1290   Mat_MatMatTransMult *abt;
1291   Mat_Product         *product = C->product;
1292 
1293   PetscFunctionBegin;
1294   PetscCheck(product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1295   abt = (Mat_MatMatTransMult *)product->data;
1296   PetscCheck(abt,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1297   /* clear old values in C */
1298   if (!c->a) {
1299     PetscCall(PetscCalloc1(ci[cm]+1,&ca));
1300     c->a      = ca;
1301     c->free_a = PETSC_TRUE;
1302   } else {
1303     ca =  c->a;
1304     PetscCall(PetscArrayzero(ca,ci[cm]+1));
1305   }
1306 
1307   if (abt->usecoloring) {
1308     MatTransposeColoring matcoloring = abt->matcoloring;
1309     Mat                  Bt_dense,C_dense = abt->ABt_den;
1310 
1311     /* Get Bt_dense by Apply MatTransposeColoring to B */
1312     Bt_dense = abt->Bt_den;
1313     PetscCall(MatTransColoringApplySpToDen(matcoloring,B,Bt_dense));
1314 
1315     /* C_dense = A*Bt_dense */
1316     PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense));
1317 
1318     /* Recover C from C_dense */
1319     PetscCall(MatTransColoringApplyDenToSp(matcoloring,C_dense,C));
1320     PetscFunctionReturn(0);
1321   }
1322 
1323   for (i=0; i<cm; i++) {
1324     anzi = ai[i+1] - ai[i];
1325     acol = aj + ai[i];
1326     aval = aa + ai[i];
1327     cnzi = ci[i+1] - ci[i];
1328     ccol = cj + ci[i];
1329     cval = ca + ci[i];
1330     for (j=0; j<cnzi; j++) {
1331       brow = ccol[j];
1332       bnzj = bi[brow+1] - bi[brow];
1333       bcol = bj + bi[brow];
1334       bval = ba + bi[brow];
1335 
1336       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
1337       nexta = 0; nextb = 0;
1338       while (nexta<anzi && nextb<bnzj) {
1339         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
1340         if (nexta == anzi) break;
1341         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
1342         if (nextb == bnzj) break;
1343         if (acol[nexta] == bcol[nextb]) {
1344           cval[j] += aval[nexta]*bval[nextb];
1345           nexta++; nextb++;
1346           flops += 2;
1347         }
1348       }
1349     }
1350   }
1351   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
1352   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
1353   PetscCall(PetscLogFlops(flops));
1354   PetscFunctionReturn(0);
1355 }
1356 
1357 PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
1358 {
1359   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult*)data;
1360 
1361   PetscFunctionBegin;
1362   PetscCall(MatDestroy(&atb->At));
1363   if (atb->destroy) PetscCall((*atb->destroy)(atb->data));
1364   PetscCall(PetscFree(atb));
1365   PetscFunctionReturn(0);
1366 }
1367 
1368 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1369 {
1370   Mat            At = NULL;
1371   PetscInt       *ati,*atj;
1372   Mat_Product    *product = C->product;
1373   PetscBool      flg,def,square;
1374 
1375   PetscFunctionBegin;
1376   MatCheckProduct(C,4);
1377   square = (PetscBool)(A == B && A->symmetric && A->symmetric_set);
1378   /* outerproduct */
1379   PetscCall(PetscStrcmp(product->alg,"outerproduct",&flg));
1380   if (flg) {
1381     /* create symbolic At */
1382     if (!square) {
1383       PetscCall(MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj));
1384       PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At));
1385       PetscCall(MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs)));
1386       PetscCall(MatSetType(At,((PetscObject)A)->type_name));
1387     }
1388     /* get symbolic C=At*B */
1389     PetscCall(MatProductSetAlgorithm(C,"sorted"));
1390     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C));
1391 
1392     /* clean up */
1393     if (!square) {
1394       PetscCall(MatDestroy(&At));
1395       PetscCall(MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj));
1396     }
1397 
1398     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
1399     PetscCall(MatProductSetAlgorithm(C,"outerproduct"));
1400     PetscFunctionReturn(0);
1401   }
1402 
1403   /* matmatmult */
1404   PetscCall(PetscStrcmp(product->alg,"default",&def));
1405   PetscCall(PetscStrcmp(product->alg,"at*b",&flg));
1406   if (flg || def) {
1407     Mat_MatTransMatMult *atb;
1408 
1409     PetscCheck(!product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
1410     PetscCall(PetscNew(&atb));
1411     if (!square) {
1412       PetscCall(MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At));
1413     }
1414     PetscCall(MatProductSetAlgorithm(C,"sorted"));
1415     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C));
1416     PetscCall(MatProductSetAlgorithm(C,"at*b"));
1417     product->data    = atb;
1418     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
1419     atb->At          = At;
1420     atb->updateAt    = PETSC_FALSE; /* because At is computed here */
1421 
1422     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
1423     PetscFunctionReturn(0);
1424   }
1425 
1426   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1427 }
1428 
1429 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1430 {
1431   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1432   PetscInt       am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1433   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1434   PetscLogDouble flops=0.0;
1435   MatScalar      *aa=a->a,*ba,*ca,*caj;
1436 
1437   PetscFunctionBegin;
1438   if (!c->a) {
1439     PetscCall(PetscCalloc1(ci[cm]+1,&ca));
1440 
1441     c->a      = ca;
1442     c->free_a = PETSC_TRUE;
1443   } else {
1444     ca   = c->a;
1445     PetscCall(PetscArrayzero(ca,ci[cm]));
1446   }
1447 
1448   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1449   for (i=0; i<am; i++) {
1450     bj   = b->j + bi[i];
1451     ba   = b->a + bi[i];
1452     bnzi = bi[i+1] - bi[i];
1453     anzi = ai[i+1] - ai[i];
1454     for (j=0; j<anzi; j++) {
1455       nextb = 0;
1456       crow  = *aj++;
1457       cjj   = cj + ci[crow];
1458       caj   = ca + ci[crow];
1459       /* perform sparse axpy operation.  Note cjj includes bj. */
1460       for (k=0; nextb<bnzi; k++) {
1461         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
1462           caj[k] += (*aa)*(*(ba+nextb));
1463           nextb++;
1464         }
1465       }
1466       flops += 2*bnzi;
1467       aa++;
1468     }
1469   }
1470 
1471   /* Assemble the final matrix and clean up */
1472   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
1473   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
1474   PetscCall(PetscLogFlops(flops));
1475   PetscFunctionReturn(0);
1476 }
1477 
1478 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
1479 {
1480   PetscFunctionBegin;
1481   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C));
1482   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1483   PetscFunctionReturn(0);
1484 }
1485 
1486 PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C,const PetscBool add)
1487 {
1488   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
1489   PetscScalar       *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4;
1490   const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av;
1491   const PetscInt    *aj;
1492   PetscInt          cm=C->rmap->n,cn=B->cmap->n,bm,am=A->rmap->n;
1493   PetscInt          clda;
1494   PetscInt          am4,bm4,col,i,j,n;
1495 
1496   PetscFunctionBegin;
1497   if (!cm || !cn) PetscFunctionReturn(0);
1498   PetscCall(MatSeqAIJGetArrayRead(A,&av));
1499   if (add) {
1500     PetscCall(MatDenseGetArray(C,&c));
1501   } else {
1502     PetscCall(MatDenseGetArrayWrite(C,&c));
1503   }
1504   PetscCall(MatDenseGetArrayRead(B,&b));
1505   PetscCall(MatDenseGetLDA(B,&bm));
1506   PetscCall(MatDenseGetLDA(C,&clda));
1507   am4 = 4*clda;
1508   bm4 = 4*bm;
1509   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1510   c1 = c; c2 = c1 + clda; c3 = c2 + clda; c4 = c3 + clda;
1511   for (col=0; col<(cn/4)*4; col += 4) {  /* over columns of C */
1512     for (i=0; i<am; i++) {        /* over rows of A in those columns */
1513       r1 = r2 = r3 = r4 = 0.0;
1514       n  = a->i[i+1] - a->i[i];
1515       aj = a->j + a->i[i];
1516       aa = av + a->i[i];
1517       for (j=0; j<n; j++) {
1518         const PetscScalar aatmp = aa[j];
1519         const PetscInt    ajtmp = aj[j];
1520         r1 += aatmp*b1[ajtmp];
1521         r2 += aatmp*b2[ajtmp];
1522         r3 += aatmp*b3[ajtmp];
1523         r4 += aatmp*b4[ajtmp];
1524       }
1525       if (add) {
1526         c1[i] += r1;
1527         c2[i] += r2;
1528         c3[i] += r3;
1529         c4[i] += r4;
1530       } else {
1531         c1[i] = r1;
1532         c2[i] = r2;
1533         c3[i] = r3;
1534         c4[i] = r4;
1535       }
1536     }
1537     b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1538     c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1539   }
1540   /* process remaining columns */
1541   if (col != cn) {
1542     PetscInt rc = cn-col;
1543 
1544     if (rc == 1) {
1545       for (i=0; i<am; i++) {
1546         r1 = 0.0;
1547         n  = a->i[i+1] - a->i[i];
1548         aj = a->j + a->i[i];
1549         aa = av + a->i[i];
1550         for (j=0; j<n; j++) r1 += aa[j]*b1[aj[j]];
1551         if (add) c1[i] += r1;
1552         else c1[i] = r1;
1553       }
1554     } else if (rc == 2) {
1555       for (i=0; i<am; i++) {
1556         r1 = r2 = 0.0;
1557         n  = a->i[i+1] - a->i[i];
1558         aj = a->j + a->i[i];
1559         aa = av + a->i[i];
1560         for (j=0; j<n; j++) {
1561           const PetscScalar aatmp = aa[j];
1562           const PetscInt    ajtmp = aj[j];
1563           r1 += aatmp*b1[ajtmp];
1564           r2 += aatmp*b2[ajtmp];
1565         }
1566         if (add) {
1567           c1[i] += r1;
1568           c2[i] += r2;
1569         } else {
1570           c1[i] = r1;
1571           c2[i] = r2;
1572         }
1573       }
1574     } else {
1575       for (i=0; i<am; i++) {
1576         r1 = r2 = r3 = 0.0;
1577         n  = a->i[i+1] - a->i[i];
1578         aj = a->j + a->i[i];
1579         aa = av + a->i[i];
1580         for (j=0; j<n; j++) {
1581           const PetscScalar aatmp = aa[j];
1582           const PetscInt    ajtmp = aj[j];
1583           r1 += aatmp*b1[ajtmp];
1584           r2 += aatmp*b2[ajtmp];
1585           r3 += aatmp*b3[ajtmp];
1586         }
1587         if (add) {
1588           c1[i] += r1;
1589           c2[i] += r2;
1590           c3[i] += r3;
1591         } else {
1592           c1[i] = r1;
1593           c2[i] = r2;
1594           c3[i] = r3;
1595         }
1596       }
1597     }
1598   }
1599   PetscCall(PetscLogFlops(cn*(2.0*a->nz)));
1600   if (add) {
1601     PetscCall(MatDenseRestoreArray(C,&c));
1602   } else {
1603     PetscCall(MatDenseRestoreArrayWrite(C,&c));
1604   }
1605   PetscCall(MatDenseRestoreArrayRead(B,&b));
1606   PetscCall(MatSeqAIJRestoreArrayRead(A,&av));
1607   PetscFunctionReturn(0);
1608 }
1609 
1610 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1611 {
1612   PetscFunctionBegin;
1613   PetscCheck(B->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n);
1614   PetscCheck(A->rmap->n == C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n);
1615   PetscCheck(B->cmap->n == C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n);
1616 
1617   PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C,PETSC_FALSE));
1618   PetscFunctionReturn(0);
1619 }
1620 
1621 /* ------------------------------------------------------- */
1622 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1623 {
1624   PetscFunctionBegin;
1625   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
1626   C->ops->productsymbolic = MatProductSymbolic_AB;
1627   PetscFunctionReturn(0);
1628 }
1629 
1630 PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat,Mat,PetscReal,Mat);
1631 
1632 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1633 {
1634   PetscFunctionBegin;
1635   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1636   C->ops->productsymbolic          = MatProductSymbolic_AtB;
1637   PetscFunctionReturn(0);
1638 }
1639 
1640 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
1641 {
1642   PetscFunctionBegin;
1643   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1644   C->ops->productsymbolic          = MatProductSymbolic_ABt;
1645   PetscFunctionReturn(0);
1646 }
1647 
1648 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1649 {
1650   Mat_Product    *product = C->product;
1651 
1652   PetscFunctionBegin;
1653   switch (product->type) {
1654   case MATPRODUCT_AB:
1655     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C));
1656     break;
1657   case MATPRODUCT_AtB:
1658     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C));
1659     break;
1660   case MATPRODUCT_ABt:
1661     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C));
1662     break;
1663   default:
1664     break;
1665   }
1666   PetscFunctionReturn(0);
1667 }
1668 /* ------------------------------------------------------- */
1669 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1670 {
1671   Mat_Product    *product = C->product;
1672   Mat            A = product->A;
1673   PetscBool      baij;
1674 
1675   PetscFunctionBegin;
1676   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij));
1677   if (!baij) { /* A is seqsbaij */
1678     PetscBool sbaij;
1679     PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij));
1680     PetscCheck(sbaij,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format");
1681 
1682     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
1683   } else { /* A is seqbaij */
1684     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
1685   }
1686 
1687   C->ops->productsymbolic = MatProductSymbolic_AB;
1688   PetscFunctionReturn(0);
1689 }
1690 
1691 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1692 {
1693   Mat_Product    *product = C->product;
1694 
1695   PetscFunctionBegin;
1696   MatCheckProduct(C,1);
1697   PetscCheck(product->A,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
1698   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric)) {
1699     PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C));
1700   }
1701   PetscFunctionReturn(0);
1702 }
1703 
1704 /* ------------------------------------------------------- */
1705 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1706 {
1707   PetscFunctionBegin;
1708   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
1709   C->ops->productsymbolic = MatProductSymbolic_AB;
1710   PetscFunctionReturn(0);
1711 }
1712 
1713 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1714 {
1715   Mat_Product    *product = C->product;
1716 
1717   PetscFunctionBegin;
1718   if (product->type == MATPRODUCT_AB) {
1719     PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C));
1720   }
1721   PetscFunctionReturn(0);
1722 }
1723 /* ------------------------------------------------------- */
1724 
1725 PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1726 {
1727   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
1728   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
1729   PetscInt       *bi      = b->i,*bj=b->j;
1730   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1731   MatScalar      *btval,*btval_den,*ba=b->a;
1732   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1733 
1734   PetscFunctionBegin;
1735   btval_den=btdense->v;
1736   PetscCall(PetscArrayzero(btval_den,m*n));
1737   for (k=0; k<ncolors; k++) {
1738     ncolumns = coloring->ncolumns[k];
1739     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1740       col   = *(columns + colorforcol[k] + l);
1741       btcol = bj + bi[col];
1742       btval = ba + bi[col];
1743       anz   = bi[col+1] - bi[col];
1744       for (j=0; j<anz; j++) {
1745         brow            = btcol[j];
1746         btval_den[brow] = btval[j];
1747       }
1748     }
1749     btval_den += m;
1750   }
1751   PetscFunctionReturn(0);
1752 }
1753 
1754 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1755 {
1756   Mat_SeqAIJ        *csp = (Mat_SeqAIJ*)Csp->data;
1757   const PetscScalar *ca_den,*ca_den_ptr;
1758   PetscScalar       *ca=csp->a;
1759   PetscInt          k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1760   PetscInt          brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1761   PetscInt          nrows,*row,*idx;
1762   PetscInt          *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
1763 
1764   PetscFunctionBegin;
1765   PetscCall(MatDenseGetArrayRead(Cden,&ca_den));
1766 
1767   if (brows > 0) {
1768     PetscInt *lstart,row_end,row_start;
1769     lstart = matcoloring->lstart;
1770     PetscCall(PetscArrayzero(lstart,ncolors));
1771 
1772     row_end = brows;
1773     if (row_end > m) row_end = m;
1774     for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1775       ca_den_ptr = ca_den;
1776       for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1777         nrows = matcoloring->nrows[k];
1778         row   = rows  + colorforrow[k];
1779         idx   = den2sp + colorforrow[k];
1780         for (l=lstart[k]; l<nrows; l++) {
1781           if (row[l] >= row_end) {
1782             lstart[k] = l;
1783             break;
1784           } else {
1785             ca[idx[l]] = ca_den_ptr[row[l]];
1786           }
1787         }
1788         ca_den_ptr += m;
1789       }
1790       row_end += brows;
1791       if (row_end > m) row_end = m;
1792     }
1793   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1794     ca_den_ptr = ca_den;
1795     for (k=0; k<ncolors; k++) {
1796       nrows = matcoloring->nrows[k];
1797       row   = rows  + colorforrow[k];
1798       idx   = den2sp + colorforrow[k];
1799       for (l=0; l<nrows; l++) {
1800         ca[idx[l]] = ca_den_ptr[row[l]];
1801       }
1802       ca_den_ptr += m;
1803     }
1804   }
1805 
1806   PetscCall(MatDenseRestoreArrayRead(Cden,&ca_den));
1807 #if defined(PETSC_USE_INFO)
1808   if (matcoloring->brows > 0) {
1809     PetscCall(PetscInfo(Csp,"Loop over %" PetscInt_FMT " row blocks for den2sp\n",brows));
1810   } else {
1811     PetscCall(PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n"));
1812   }
1813 #endif
1814   PetscFunctionReturn(0);
1815 }
1816 
1817 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1818 {
1819   PetscInt       i,n,nrows,Nbs,j,k,m,ncols,col,cm;
1820   const PetscInt *is,*ci,*cj,*row_idx;
1821   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1822   IS             *isa;
1823   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1824   PetscInt       *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1825   PetscInt       *colorforcol,*columns,*columns_i,brows;
1826   PetscBool      flg;
1827 
1828   PetscFunctionBegin;
1829   PetscCall(ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa));
1830 
1831   /* bs >1 is not being tested yet! */
1832   Nbs       = mat->cmap->N/bs;
1833   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1834   c->N      = Nbs;
1835   c->m      = c->M;
1836   c->rstart = 0;
1837   c->brows  = 100;
1838 
1839   c->ncolors = nis;
1840   PetscCall(PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow));
1841   PetscCall(PetscMalloc1(csp->nz+1,&rows));
1842   PetscCall(PetscMalloc1(csp->nz+1,&den2sp));
1843 
1844   brows = c->brows;
1845   PetscCall(PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg));
1846   if (flg) c->brows = brows;
1847   if (brows > 0) {
1848     PetscCall(PetscMalloc1(nis+1,&c->lstart));
1849   }
1850 
1851   colorforrow[0] = 0;
1852   rows_i         = rows;
1853   den2sp_i       = den2sp;
1854 
1855   PetscCall(PetscMalloc1(nis+1,&colorforcol));
1856   PetscCall(PetscMalloc1(Nbs+1,&columns));
1857 
1858   colorforcol[0] = 0;
1859   columns_i      = columns;
1860 
1861   /* get column-wise storage of mat */
1862   PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL));
1863 
1864   cm   = c->m;
1865   PetscCall(PetscMalloc1(cm+1,&rowhit));
1866   PetscCall(PetscMalloc1(cm+1,&idxhit));
1867   for (i=0; i<nis; i++) { /* loop over color */
1868     PetscCall(ISGetLocalSize(isa[i],&n));
1869     PetscCall(ISGetIndices(isa[i],&is));
1870 
1871     c->ncolumns[i] = n;
1872     if (n) PetscCall(PetscArraycpy(columns_i,is,n));
1873     colorforcol[i+1] = colorforcol[i] + n;
1874     columns_i       += n;
1875 
1876     /* fast, crude version requires O(N*N) work */
1877     PetscCall(PetscArrayzero(rowhit,cm));
1878 
1879     for (j=0; j<n; j++) { /* loop over columns*/
1880       col     = is[j];
1881       row_idx = cj + ci[col];
1882       m       = ci[col+1] - ci[col];
1883       for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1884         idxhit[*row_idx]   = spidx[ci[col] + k];
1885         rowhit[*row_idx++] = col + 1;
1886       }
1887     }
1888     /* count the number of hits */
1889     nrows = 0;
1890     for (j=0; j<cm; j++) {
1891       if (rowhit[j]) nrows++;
1892     }
1893     c->nrows[i]      = nrows;
1894     colorforrow[i+1] = colorforrow[i] + nrows;
1895 
1896     nrows = 0;
1897     for (j=0; j<cm; j++) { /* loop over rows */
1898       if (rowhit[j]) {
1899         rows_i[nrows]   = j;
1900         den2sp_i[nrows] = idxhit[j];
1901         nrows++;
1902       }
1903     }
1904     den2sp_i += nrows;
1905 
1906     PetscCall(ISRestoreIndices(isa[i],&is));
1907     rows_i += nrows;
1908   }
1909   PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL));
1910   PetscCall(PetscFree(rowhit));
1911   PetscCall(ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa));
1912   PetscCheck(csp->nz == colorforrow[nis],PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT,csp->nz,colorforrow[nis]);
1913 
1914   c->colorforrow = colorforrow;
1915   c->rows        = rows;
1916   c->den2sp      = den2sp;
1917   c->colorforcol = colorforcol;
1918   c->columns     = columns;
1919 
1920   PetscCall(PetscFree(idxhit));
1921   PetscFunctionReturn(0);
1922 }
1923 
1924 /* --------------------------------------------------------------- */
1925 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1926 {
1927   Mat_Product    *product = C->product;
1928   Mat            A=product->A,B=product->B;
1929 
1930   PetscFunctionBegin;
1931   if (C->ops->mattransposemultnumeric) {
1932     /* Alg: "outerproduct" */
1933     PetscCall((*C->ops->mattransposemultnumeric)(A,B,C));
1934   } else {
1935     /* Alg: "matmatmult" -- C = At*B */
1936     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;
1937     Mat                 At;
1938 
1939     PetscCheck(atb,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1940     At = atb->At;
1941     if (atb->updateAt && At) { /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ() */
1942       PetscCall(MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At));
1943     }
1944     PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(At ? At : A,B,C));
1945     atb->updateAt = PETSC_TRUE;
1946   }
1947   PetscFunctionReturn(0);
1948 }
1949 
1950 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1951 {
1952   Mat_Product    *product = C->product;
1953   Mat            A=product->A,B=product->B;
1954   PetscReal      fill=product->fill;
1955 
1956   PetscFunctionBegin;
1957   PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C));
1958 
1959   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
1960   PetscFunctionReturn(0);
1961 }
1962 
1963 /* --------------------------------------------------------------- */
1964 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1965 {
1966   Mat_Product    *product = C->product;
1967   PetscInt       alg = 0; /* default algorithm */
1968   PetscBool      flg = PETSC_FALSE;
1969 #if !defined(PETSC_HAVE_HYPRE)
1970   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
1971   PetscInt       nalg = 7;
1972 #else
1973   const char     *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"};
1974   PetscInt       nalg = 8;
1975 #endif
1976 
1977   PetscFunctionBegin;
1978   /* Set default algorithm */
1979   PetscCall(PetscStrcmp(C->product->alg,"default",&flg));
1980   if (flg) {
1981     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
1982   }
1983 
1984   /* Get runtime option */
1985   if (product->api_user) {
1986     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");
1987     PetscCall(PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg));
1988     PetscOptionsEnd();
1989   } else {
1990     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");
1991     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg));
1992     PetscOptionsEnd();
1993   }
1994   if (flg) {
1995     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
1996   }
1997 
1998   C->ops->productsymbolic = MatProductSymbolic_AB;
1999   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
2000   PetscFunctionReturn(0);
2001 }
2002 
2003 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
2004 {
2005   Mat_Product    *product = C->product;
2006   PetscInt       alg = 0; /* default algorithm */
2007   PetscBool      flg = PETSC_FALSE;
2008   const char     *algTypes[3] = {"default","at*b","outerproduct"};
2009   PetscInt       nalg = 3;
2010 
2011   PetscFunctionBegin;
2012   /* Get runtime option */
2013   if (product->api_user) {
2014     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");
2015     PetscCall(PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg));
2016     PetscOptionsEnd();
2017   } else {
2018     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");
2019     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg));
2020     PetscOptionsEnd();
2021   }
2022   if (flg) {
2023     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2024   }
2025 
2026   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
2027   PetscFunctionReturn(0);
2028 }
2029 
2030 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
2031 {
2032   Mat_Product    *product = C->product;
2033   PetscInt       alg = 0; /* default algorithm */
2034   PetscBool      flg = PETSC_FALSE;
2035   const char     *algTypes[2] = {"default","color"};
2036   PetscInt       nalg = 2;
2037 
2038   PetscFunctionBegin;
2039   /* Set default algorithm */
2040   PetscCall(PetscStrcmp(C->product->alg,"default",&flg));
2041   if (!flg) {
2042     alg = 1;
2043     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2044   }
2045 
2046   /* Get runtime option */
2047   if (product->api_user) {
2048     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");
2049     PetscCall(PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg));
2050     PetscOptionsEnd();
2051   } else {
2052     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");
2053     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg));
2054     PetscOptionsEnd();
2055   }
2056   if (flg) {
2057     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2058   }
2059 
2060   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
2061   C->ops->productsymbolic          = MatProductSymbolic_ABt;
2062   PetscFunctionReturn(0);
2063 }
2064 
2065 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
2066 {
2067   Mat_Product    *product = C->product;
2068   PetscBool      flg = PETSC_FALSE;
2069   PetscInt       alg = 0; /* default algorithm -- alg=1 should be default!!! */
2070 #if !defined(PETSC_HAVE_HYPRE)
2071   const char     *algTypes[2] = {"scalable","rap"};
2072   PetscInt       nalg = 2;
2073 #else
2074   const char     *algTypes[3] = {"scalable","rap","hypre"};
2075   PetscInt       nalg = 3;
2076 #endif
2077 
2078   PetscFunctionBegin;
2079   /* Set default algorithm */
2080   PetscCall(PetscStrcmp(product->alg,"default",&flg));
2081   if (flg) {
2082     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2083   }
2084 
2085   /* Get runtime option */
2086   if (product->api_user) {
2087     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");
2088     PetscCall(PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg));
2089     PetscOptionsEnd();
2090   } else {
2091     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");
2092     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg));
2093     PetscOptionsEnd();
2094   }
2095   if (flg) {
2096     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2097   }
2098 
2099   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
2100   PetscFunctionReturn(0);
2101 }
2102 
2103 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2104 {
2105   Mat_Product    *product = C->product;
2106   PetscBool      flg = PETSC_FALSE;
2107   PetscInt       alg = 0; /* default algorithm */
2108   const char     *algTypes[3] = {"r*a*rt","r*art","coloring_rart"};
2109   PetscInt        nalg = 3;
2110 
2111   PetscFunctionBegin;
2112   /* Set default algorithm */
2113   PetscCall(PetscStrcmp(product->alg,"default",&flg));
2114   if (flg) {
2115     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2116   }
2117 
2118   /* Get runtime option */
2119   if (product->api_user) {
2120     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");
2121     PetscCall(PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg));
2122     PetscOptionsEnd();
2123   } else {
2124     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");
2125     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg));
2126     PetscOptionsEnd();
2127   }
2128   if (flg) {
2129     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2130   }
2131 
2132   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
2133   PetscFunctionReturn(0);
2134 }
2135 
2136 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
2137 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2138 {
2139   Mat_Product    *product = C->product;
2140   PetscInt       alg = 0; /* default algorithm */
2141   PetscBool      flg = PETSC_FALSE;
2142   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
2143   PetscInt       nalg = 7;
2144 
2145   PetscFunctionBegin;
2146   /* Set default algorithm */
2147   PetscCall(PetscStrcmp(product->alg,"default",&flg));
2148   if (flg) {
2149     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2150   }
2151 
2152   /* Get runtime option */
2153   if (product->api_user) {
2154     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");
2155     PetscCall(PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg));
2156     PetscOptionsEnd();
2157   } else {
2158     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");
2159     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg));
2160     PetscOptionsEnd();
2161   }
2162   if (flg) {
2163     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2164   }
2165 
2166   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
2167   C->ops->productsymbolic    = MatProductSymbolic_ABC;
2168   PetscFunctionReturn(0);
2169 }
2170 
2171 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2172 {
2173   Mat_Product    *product = C->product;
2174 
2175   PetscFunctionBegin;
2176   switch (product->type) {
2177   case MATPRODUCT_AB:
2178     PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C));
2179     break;
2180   case MATPRODUCT_AtB:
2181     PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C));
2182     break;
2183   case MATPRODUCT_ABt:
2184     PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C));
2185     break;
2186   case MATPRODUCT_PtAP:
2187     PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C));
2188     break;
2189   case MATPRODUCT_RARt:
2190     PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C));
2191     break;
2192   case MATPRODUCT_ABC:
2193     PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C));
2194     break;
2195   default:
2196     break;
2197   }
2198   PetscFunctionReturn(0);
2199 }
2200