xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 117ef88edefbfc12e7c19efe87a19a2e1b0acd4f)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4           C = A * B
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <petscbt.h>
10 #include <petsc/private/isimpl.h>
11 #include <../src/mat/impls/dense/seq/dense.h>
12 
13 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
14 {
15   PetscErrorCode ierr;
16 
17   PetscFunctionBegin;
18   if (C->ops->matmultnumeric) {
19     if (C->ops->matmultnumeric == MatMatMultNumeric_SeqAIJ_SeqAIJ) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Recursive call");
20     ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr);
21   } else {
22     ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C);CHKERRQ(ierr);
23   }
24   PetscFunctionReturn(0);
25 }
26 
27 /* Modified from MatCreateSeqAIJWithArrays() */
28 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat)
29 {
30   PetscErrorCode ierr;
31   PetscInt       ii;
32   Mat_SeqAIJ     *aij;
33   PetscBool      isseqaij;
34 
35   PetscFunctionBegin;
36   if (m > 0 && i[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0");
37   ierr = MatSetSizes(mat,m,n,m,n);CHKERRQ(ierr);
38 
39   if (!mtype) {
40     ierr = PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
41     if (!isseqaij) { ierr = MatSetType(mat,MATSEQAIJ);CHKERRQ(ierr); }
42   } else {
43     ierr = MatSetType(mat,mtype);CHKERRQ(ierr);
44   }
45   ierr = MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,0);CHKERRQ(ierr);
46   aij  = (Mat_SeqAIJ*)(mat)->data;
47   ierr = PetscMalloc1(m,&aij->imax);CHKERRQ(ierr);
48   ierr = PetscMalloc1(m,&aij->ilen);CHKERRQ(ierr);
49 
50   aij->i            = i;
51   aij->j            = j;
52   aij->a            = a;
53   aij->singlemalloc = PETSC_FALSE;
54   aij->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
55   aij->free_a       = PETSC_FALSE;
56   aij->free_ij      = PETSC_FALSE;
57 
58   for (ii=0; ii<m; ii++) {
59     aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii];
60   }
61 
62   PetscFunctionReturn(0);
63 }
64 
65 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
66 {
67   PetscErrorCode      ierr;
68   Mat_Product         *product = C->product;
69   MatProductAlgorithm alg;
70   PetscBool           flg;
71 
72   PetscFunctionBegin;
73   if (product) {
74     alg = product->alg;
75   } else {
76     alg = "sorted";
77   }
78   /* sorted */
79   ierr = PetscStrcmp(alg,"sorted",&flg);CHKERRQ(ierr);
80   if (flg) {
81     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C);CHKERRQ(ierr);
82     PetscFunctionReturn(0);
83   }
84 
85   /* scalable */
86   ierr = PetscStrcmp(alg,"scalable",&flg);CHKERRQ(ierr);
87   if (flg) {
88     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr);
89     PetscFunctionReturn(0);
90   }
91 
92   /* scalable_fast */
93   ierr = PetscStrcmp(alg,"scalable_fast",&flg);CHKERRQ(ierr);
94   if (flg) {
95     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr);
96     PetscFunctionReturn(0);
97   }
98 
99   /* heap */
100   ierr = PetscStrcmp(alg,"heap",&flg);CHKERRQ(ierr);
101   if (flg) {
102     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr);
103     PetscFunctionReturn(0);
104   }
105 
106   /* btheap */
107   ierr = PetscStrcmp(alg,"btheap",&flg);CHKERRQ(ierr);
108   if (flg) {
109     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr);
110     PetscFunctionReturn(0);
111   }
112 
113   /* llcondensed */
114   ierr = PetscStrcmp(alg,"llcondensed",&flg);CHKERRQ(ierr);
115   if (flg) {
116     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr);
117     PetscFunctionReturn(0);
118   }
119 
120   /* rowmerge */
121   ierr = PetscStrcmp(alg,"rowmerge",&flg);CHKERRQ(ierr);
122   if (flg) {
123     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);CHKERRQ(ierr);
124     PetscFunctionReturn(0);
125   }
126 
127 #if defined(PETSC_HAVE_HYPRE)
128   ierr = PetscStrcmp(alg,"hypre",&flg);CHKERRQ(ierr);
129   if (flg) {
130     ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr);
131     PetscFunctionReturn(0);
132   }
133 #endif
134 
135   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
136   PetscFunctionReturn(0);
137 }
138 
139 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C)
140 {
141   PetscErrorCode     ierr;
142   Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
143   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
144   PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
145   PetscReal          afill;
146   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
147   PetscTable         ta;
148   PetscBT            lnkbt;
149   PetscFreeSpaceList free_space=NULL,current_space=NULL;
150 
151   PetscFunctionBegin;
152   /* Get ci and cj */
153   /*---------------*/
154   /* Allocate ci array, arrays for fill computation and */
155   /* free space for accumulating nonzero column info */
156   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
157   ci[0] = 0;
158 
159   /* create and initialize a linked list */
160   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
161   MatRowMergeMax_SeqAIJ(b,bm,ta);
162   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
163   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
164 
165   ierr = PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);CHKERRQ(ierr);
166 
167   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
168   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
169 
170   current_space = free_space;
171 
172   /* Determine ci and cj */
173   for (i=0; i<am; i++) {
174     anzi = ai[i+1] - ai[i];
175     aj   = a->j + ai[i];
176     for (j=0; j<anzi; j++) {
177       brow = aj[j];
178       bnzj = bi[brow+1] - bi[brow];
179       bj   = b->j + bi[brow];
180       /* add non-zero cols of B into the sorted linked list lnk */
181       ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr);
182     }
183     cnzi = lnk[0];
184 
185     /* If free space is not available, make more free space */
186     /* Double the amount of total space in the list */
187     if (current_space->local_remaining<cnzi) {
188       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
189       ndouble++;
190     }
191 
192     /* Copy data into free space, then initialize lnk */
193     ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
194 
195     current_space->array           += cnzi;
196     current_space->local_used      += cnzi;
197     current_space->local_remaining -= cnzi;
198 
199     ci[i+1] = ci[i] + cnzi;
200   }
201 
202   /* Column indices are in the list of free space */
203   /* Allocate space for cj, initialize cj, and */
204   /* destroy list of free space and other temporary array(s) */
205   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
206   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
207   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
208 
209   /* put together the new symbolic matrix */
210   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
211   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
212 
213   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
214   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
215   c          = (Mat_SeqAIJ*)(C->data);
216   c->free_a  = PETSC_FALSE;
217   c->free_ij = PETSC_TRUE;
218   c->nonew   = 0;
219 
220   /* fast, needs non-scalable O(bn) array 'abdense' */
221   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
222 
223   /* set MatInfo */
224   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
225   if (afill < 1.0) afill = 1.0;
226   c->maxnz                  = ci[am];
227   c->nz                     = ci[am];
228   C->info.mallocs           = ndouble;
229   C->info.fill_ratio_given  = fill;
230   C->info.fill_ratio_needed = afill;
231 
232 #if defined(PETSC_USE_INFO)
233   if (ci[am]) {
234     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
235     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
236   } else {
237     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
238   }
239 #endif
240   PetscFunctionReturn(0);
241 }
242 
243 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C)
244 {
245   PetscErrorCode ierr;
246   PetscLogDouble flops=0.0;
247   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
248   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
249   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
250   PetscInt       *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
251   PetscInt       am   =A->rmap->n,cm=C->rmap->n;
252   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
253   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca,valtmp;
254   PetscScalar    *ab_dense;
255   PetscContainer cab_dense;
256 
257   PetscFunctionBegin;
258   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
259     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
260     c->a      = ca;
261     c->free_a = PETSC_TRUE;
262   } else ca = c->a;
263 
264   /* TODO this should be done in the symbolic phase */
265   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
266      that is hard to eradicate) */
267   ierr = PetscObjectQuery((PetscObject)C,"__PETSc__ab_dense",(PetscObject*)&cab_dense);CHKERRQ(ierr);
268   if (!cab_dense) {
269     ierr = PetscMalloc1(B->cmap->N,&ab_dense);CHKERRQ(ierr);
270     ierr = PetscContainerCreate(PETSC_COMM_SELF,&cab_dense);CHKERRQ(ierr);
271     ierr = PetscContainerSetPointer(cab_dense,ab_dense);CHKERRQ(ierr);
272     ierr = PetscContainerSetUserDestroy(cab_dense,PetscContainerUserDestroyDefault);CHKERRQ(ierr);
273     ierr = PetscObjectCompose((PetscObject)C,"__PETSc__ab_dense",(PetscObject)cab_dense);CHKERRQ(ierr);
274     ierr = PetscObjectDereference((PetscObject)cab_dense);CHKERRQ(ierr);
275   }
276   ierr = PetscContainerGetPointer(cab_dense,(void**)&ab_dense);CHKERRQ(ierr);
277   ierr = PetscArrayzero(ab_dense,B->cmap->N);CHKERRQ(ierr);
278 
279   /* clean old values in C */
280   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
281   /* Traverse A row-wise. */
282   /* Build the ith row in C by summing over nonzero columns in A, */
283   /* the rows of B corresponding to nonzeros of A. */
284   for (i=0; i<am; i++) {
285     anzi = ai[i+1] - ai[i];
286     for (j=0; j<anzi; j++) {
287       brow = aj[j];
288       bnzi = bi[brow+1] - bi[brow];
289       bjj  = bj + bi[brow];
290       baj  = ba + bi[brow];
291       /* perform dense axpy */
292       valtmp = aa[j];
293       for (k=0; k<bnzi; k++) {
294         ab_dense[bjj[k]] += valtmp*baj[k];
295       }
296       flops += 2*bnzi;
297     }
298     aj += anzi; aa += anzi;
299 
300     cnzi = ci[i+1] - ci[i];
301     for (k=0; k<cnzi; k++) {
302       ca[k]          += ab_dense[cj[k]];
303       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
304     }
305     flops += cnzi;
306     cj    += cnzi; ca += cnzi;
307   }
308   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
309   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
310   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
311   PetscFunctionReturn(0);
312 }
313 
314 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
315 {
316   PetscErrorCode ierr;
317   PetscLogDouble flops=0.0;
318   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
319   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
320   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
321   PetscInt       *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
322   PetscInt       am   = A->rmap->N,cm=C->rmap->N;
323   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
324   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
325   PetscInt       nextb;
326 
327   PetscFunctionBegin;
328   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
329     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
330     c->a      = ca;
331     c->free_a = PETSC_TRUE;
332   }
333 
334   /* clean old values in C */
335   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
336   /* Traverse A row-wise. */
337   /* Build the ith row in C by summing over nonzero columns in A, */
338   /* the rows of B corresponding to nonzeros of A. */
339   for (i=0; i<am; i++) {
340     anzi = ai[i+1] - ai[i];
341     cnzi = ci[i+1] - ci[i];
342     for (j=0; j<anzi; j++) {
343       brow = aj[j];
344       bnzi = bi[brow+1] - bi[brow];
345       bjj  = bj + bi[brow];
346       baj  = ba + bi[brow];
347       /* perform sparse axpy */
348       valtmp = aa[j];
349       nextb  = 0;
350       for (k=0; nextb<bnzi; k++) {
351         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
352           ca[k] += valtmp*baj[nextb++];
353         }
354       }
355       flops += 2*bnzi;
356     }
357     aj += anzi; aa += anzi;
358     cj += cnzi; ca += cnzi;
359   }
360 
361   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
362   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
363   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
364   PetscFunctionReturn(0);
365 }
366 
367 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C)
368 {
369   PetscErrorCode     ierr;
370   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
371   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
372   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
373   MatScalar          *ca;
374   PetscReal          afill;
375   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
376   PetscTable         ta;
377   PetscFreeSpaceList free_space=NULL,current_space=NULL;
378 
379   PetscFunctionBegin;
380   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
381   /*-----------------------------------------------------------------------------------------*/
382   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
383   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
384   ci[0] = 0;
385 
386   /* create and initialize a linked list */
387   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
388   MatRowMergeMax_SeqAIJ(b,bm,ta);
389   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
390   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
391 
392   ierr = PetscLLCondensedCreate_fast(Crmax,&lnk);CHKERRQ(ierr);
393 
394   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
395   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
396   current_space = free_space;
397 
398   /* Determine ci and cj */
399   for (i=0; i<am; i++) {
400     anzi = ai[i+1] - ai[i];
401     aj   = a->j + ai[i];
402     for (j=0; j<anzi; j++) {
403       brow = aj[j];
404       bnzj = bi[brow+1] - bi[brow];
405       bj   = b->j + bi[brow];
406       /* add non-zero cols of B into the sorted linked list lnk */
407       ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr);
408     }
409     cnzi = lnk[1];
410 
411     /* If free space is not available, make more free space */
412     /* Double the amount of total space in the list */
413     if (current_space->local_remaining<cnzi) {
414       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
415       ndouble++;
416     }
417 
418     /* Copy data into free space, then initialize lnk */
419     ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr);
420 
421     current_space->array           += cnzi;
422     current_space->local_used      += cnzi;
423     current_space->local_remaining -= cnzi;
424 
425     ci[i+1] = ci[i] + cnzi;
426   }
427 
428   /* Column indices are in the list of free space */
429   /* Allocate space for cj, initialize cj, and */
430   /* destroy list of free space and other temporary array(s) */
431   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
432   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
433   ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr);
434 
435   /* Allocate space for ca */
436   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
437 
438   /* put together the new symbolic matrix */
439   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
440   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
441 
442   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
443   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
444   c          = (Mat_SeqAIJ*)(C->data);
445   c->free_a  = PETSC_TRUE;
446   c->free_ij = PETSC_TRUE;
447   c->nonew   = 0;
448 
449   /* slower, less memory */
450   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
451 
452   /* set MatInfo */
453   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
454   if (afill < 1.0) afill = 1.0;
455   c->maxnz                     = ci[am];
456   c->nz                        = ci[am];
457   C->info.mallocs           = ndouble;
458   C->info.fill_ratio_given  = fill;
459   C->info.fill_ratio_needed = afill;
460 
461 #if defined(PETSC_USE_INFO)
462   if (ci[am]) {
463     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
464     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
465   } else {
466     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
467   }
468 #endif
469   PetscFunctionReturn(0);
470 }
471 
472 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C)
473 {
474   PetscErrorCode     ierr;
475   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
476   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
477   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
478   MatScalar          *ca;
479   PetscReal          afill;
480   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
481   PetscTable         ta;
482   PetscFreeSpaceList free_space=NULL,current_space=NULL;
483 
484   PetscFunctionBegin;
485   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
486   /*---------------------------------------------------------------------------------------------*/
487   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
488   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
489   ci[0] = 0;
490 
491   /* create and initialize a linked list */
492   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
493   MatRowMergeMax_SeqAIJ(b,bm,ta);
494   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
495   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
496   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
497 
498   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
499   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
500   current_space = free_space;
501 
502   /* Determine ci and cj */
503   for (i=0; i<am; i++) {
504     anzi = ai[i+1] - ai[i];
505     aj   = a->j + ai[i];
506     for (j=0; j<anzi; j++) {
507       brow = aj[j];
508       bnzj = bi[brow+1] - bi[brow];
509       bj   = b->j + bi[brow];
510       /* add non-zero cols of B into the sorted linked list lnk */
511       ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr);
512     }
513     cnzi = lnk[0];
514 
515     /* If free space is not available, make more free space */
516     /* Double the amount of total space in the list */
517     if (current_space->local_remaining<cnzi) {
518       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
519       ndouble++;
520     }
521 
522     /* Copy data into free space, then initialize lnk */
523     ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr);
524 
525     current_space->array           += cnzi;
526     current_space->local_used      += cnzi;
527     current_space->local_remaining -= cnzi;
528 
529     ci[i+1] = ci[i] + cnzi;
530   }
531 
532   /* Column indices are in the list of free space */
533   /* Allocate space for cj, initialize cj, and */
534   /* destroy list of free space and other temporary array(s) */
535   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
536   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
537   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
538 
539   /* Allocate space for ca */
540   /*-----------------------*/
541   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
542 
543   /* put together the new symbolic matrix */
544   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
545   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
546 
547   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
548   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
549   c          = (Mat_SeqAIJ*)(C->data);
550   c->free_a  = PETSC_TRUE;
551   c->free_ij = PETSC_TRUE;
552   c->nonew   = 0;
553 
554   /* slower, less memory */
555   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
556 
557   /* set MatInfo */
558   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
559   if (afill < 1.0) afill = 1.0;
560   c->maxnz                     = ci[am];
561   c->nz                        = ci[am];
562   C->info.mallocs           = ndouble;
563   C->info.fill_ratio_given  = fill;
564   C->info.fill_ratio_needed = afill;
565 
566 #if defined(PETSC_USE_INFO)
567   if (ci[am]) {
568     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
569     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
570   } else {
571     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
572   }
573 #endif
574   PetscFunctionReturn(0);
575 }
576 
577 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C)
578 {
579   PetscErrorCode     ierr;
580   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
581   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
582   PetscInt           *ci,*cj,*bb;
583   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
584   PetscReal          afill;
585   PetscInt           i,j,col,ndouble = 0;
586   PetscFreeSpaceList free_space=NULL,current_space=NULL;
587   PetscHeap          h;
588 
589   PetscFunctionBegin;
590   /* Get ci and cj - by merging sorted rows using a heap */
591   /*---------------------------------------------------------------------------------------------*/
592   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
593   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
594   ci[0] = 0;
595 
596   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
597   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
598   current_space = free_space;
599 
600   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
601   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
602 
603   /* Determine ci and cj */
604   for (i=0; i<am; i++) {
605     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
606     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
607     ci[i+1] = ci[i];
608     /* Populate the min heap */
609     for (j=0; j<anzi; j++) {
610       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
611       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
612         ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr);
613       }
614     }
615     /* Pick off the min element, adding it to free space */
616     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
617     while (j >= 0) {
618       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
619         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
620         ndouble++;
621       }
622       *(current_space->array++) = col;
623       current_space->local_used++;
624       current_space->local_remaining--;
625       ci[i+1]++;
626 
627       /* stash if anything else remains in this row of B */
628       if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);}
629       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
630         PetscInt j2,col2;
631         ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr);
632         if (col2 != col) break;
633         ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr);
634         if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);}
635       }
636       /* Put any stashed elements back into the min heap */
637       ierr = PetscHeapUnstash(h);CHKERRQ(ierr);
638       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
639     }
640   }
641   ierr = PetscFree(bb);CHKERRQ(ierr);
642   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
643 
644   /* Column indices are in the list of free space */
645   /* Allocate space for cj, initialize cj, and */
646   /* destroy list of free space and other temporary array(s) */
647   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
648   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
649 
650   /* put together the new symbolic matrix */
651   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
652   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
653 
654   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
655   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
656   c          = (Mat_SeqAIJ*)(C->data);
657   c->free_a  = PETSC_TRUE;
658   c->free_ij = PETSC_TRUE;
659   c->nonew   = 0;
660 
661   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
662 
663   /* set MatInfo */
664   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
665   if (afill < 1.0) afill = 1.0;
666   c->maxnz                     = ci[am];
667   c->nz                        = ci[am];
668   C->info.mallocs           = ndouble;
669   C->info.fill_ratio_given  = fill;
670   C->info.fill_ratio_needed = afill;
671 
672 #if defined(PETSC_USE_INFO)
673   if (ci[am]) {
674     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
675     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
676   } else {
677     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
678   }
679 #endif
680   PetscFunctionReturn(0);
681 }
682 
683 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C)
684 {
685   PetscErrorCode     ierr;
686   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
687   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
688   PetscInt           *ci,*cj,*bb;
689   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
690   PetscReal          afill;
691   PetscInt           i,j,col,ndouble = 0;
692   PetscFreeSpaceList free_space=NULL,current_space=NULL;
693   PetscHeap          h;
694   PetscBT            bt;
695 
696   PetscFunctionBegin;
697   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
698   /*---------------------------------------------------------------------------------------------*/
699   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
700   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
701   ci[0] = 0;
702 
703   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
704   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
705 
706   current_space = free_space;
707 
708   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
709   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
710   ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr);
711 
712   /* Determine ci and cj */
713   for (i=0; i<am; i++) {
714     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
715     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
716     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
717     ci[i+1] = ci[i];
718     /* Populate the min heap */
719     for (j=0; j<anzi; j++) {
720       PetscInt brow = acol[j];
721       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
722         PetscInt bcol = bj[bb[j]];
723         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
724           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
725           bb[j]++;
726           break;
727         }
728       }
729     }
730     /* Pick off the min element, adding it to free space */
731     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
732     while (j >= 0) {
733       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
734         fptr = NULL;                      /* need PetscBTMemzero */
735         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
736         ndouble++;
737       }
738       *(current_space->array++) = col;
739       current_space->local_used++;
740       current_space->local_remaining--;
741       ci[i+1]++;
742 
743       /* stash if anything else remains in this row of B */
744       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
745         PetscInt bcol = bj[bb[j]];
746         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
747           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
748           bb[j]++;
749           break;
750         }
751       }
752       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
753     }
754     if (fptr) {                 /* Clear the bits for this row */
755       for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);}
756     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
757       ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr);
758     }
759   }
760   ierr = PetscFree(bb);CHKERRQ(ierr);
761   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
762   ierr = PetscBTDestroy(&bt);CHKERRQ(ierr);
763 
764   /* Column indices are in the list of free space */
765   /* Allocate space for cj, initialize cj, and */
766   /* destroy list of free space and other temporary array(s) */
767   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
768   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
769 
770   /* put together the new symbolic matrix */
771   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
772   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
773 
774   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
775   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
776   c          = (Mat_SeqAIJ*)(C->data);
777   c->free_a  = PETSC_TRUE;
778   c->free_ij = PETSC_TRUE;
779   c->nonew   = 0;
780 
781   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
782 
783   /* set MatInfo */
784   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
785   if (afill < 1.0) afill = 1.0;
786   c->maxnz                     = ci[am];
787   c->nz                        = ci[am];
788   C->info.mallocs           = ndouble;
789   C->info.fill_ratio_given  = fill;
790   C->info.fill_ratio_needed = afill;
791 
792 #if defined(PETSC_USE_INFO)
793   if (ci[am]) {
794     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
795     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
796   } else {
797     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
798   }
799 #endif
800   PetscFunctionReturn(0);
801 }
802 
803 
804 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C)
805 {
806   PetscErrorCode     ierr;
807   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
808   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1;
809   PetscInt           *ci,*cj,*outputj,worki_L1[9],worki_L2[9];
810   PetscInt           c_maxmem,a_maxrownnz=0,a_rownnz;
811   const PetscInt     workcol[8]={0,1,2,3,4,5,6,7};
812   const PetscInt     am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
813   const PetscInt     *brow_ptr[8],*brow_end[8];
814   PetscInt           window[8];
815   PetscInt           window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows;
816   PetscInt           i,k,ndouble=0,L1_rowsleft,rowsleft;
817   PetscReal          afill;
818   PetscInt           *workj_L1,*workj_L2,*workj_L3;
819   PetscInt           L1_nnz,L2_nnz;
820 
821   /* Step 1: Get upper bound on memory required for allocation.
822              Because of the way virtual memory works,
823              only the memory pages that are actually needed will be physically allocated. */
824   PetscFunctionBegin;
825   ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
826   for (i=0; i<am; i++) {
827     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
828     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
829     a_rownnz = 0;
830     for (k=0; k<anzi; ++k) {
831       a_rownnz += bi[acol[k]+1] - bi[acol[k]];
832       if (a_rownnz > bn) {
833         a_rownnz = bn;
834         break;
835       }
836     }
837     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
838   }
839   /* temporary work areas for merging rows */
840   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L1);CHKERRQ(ierr);
841   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L2);CHKERRQ(ierr);
842   ierr = PetscMalloc1(a_maxrownnz,&workj_L3);CHKERRQ(ierr);
843 
844   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
845   c_maxmem = 8*(ai[am]+bi[bm]);
846   /* Step 2: Populate pattern for C */
847   ierr  = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr);
848 
849   ci_nnz       = 0;
850   ci[0]        = 0;
851   worki_L1[0]  = 0;
852   worki_L2[0]  = 0;
853   for (i=0; i<am; i++) {
854     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
855     const PetscInt *acol = aj + ai[i];      /* column indices of nonzero entries in this row */
856     rowsleft             = anzi;
857     inputcol_L1          = acol;
858     L2_nnz               = 0;
859     L2_nrows             = 1;  /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
860     worki_L2[1]          = 0;
861     outputi_nnz          = 0;
862 
863     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
864     while (ci_nnz+a_maxrownnz > c_maxmem) {
865       c_maxmem *= 2;
866       ndouble++;
867       ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr);
868     }
869 
870     while (rowsleft) {
871       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
872       L1_nrows    = 0;
873       L1_nnz      = 0;
874       inputcol    = inputcol_L1;
875       inputi      = bi;
876       inputj      = bj;
877 
878       /* The following macro is used to specialize for small rows in A.
879          This helps with compiler unrolling, improving performance substantially.
880           Input:  inputj   inputi  inputcol  bn
881           Output: outputj  outputi_nnz                       */
882        #define MatMatMultSymbolic_RowMergeMacro(ANNZ)                        \
883          window_min  = bn;                                                   \
884          outputi_nnz = 0;                                                    \
885          for (k=0; k<ANNZ; ++k) {                                            \
886            brow_ptr[k] = inputj + inputi[inputcol[k]];                       \
887            brow_end[k] = inputj + inputi[inputcol[k]+1];                     \
888            window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn;   \
889            window_min  = PetscMin(window[k], window_min);                    \
890          }                                                                   \
891          while (window_min < bn) {                                           \
892            outputj[outputi_nnz++] = window_min;                              \
893            /* advance front and compute new minimum */                       \
894            old_window_min = window_min;                                      \
895            window_min = bn;                                                  \
896            for (k=0; k<ANNZ; ++k) {                                          \
897              if (window[k] == old_window_min) {                              \
898                brow_ptr[k]++;                                                \
899                window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
900              }                                                               \
901              window_min = PetscMin(window[k], window_min);                   \
902            }                                                                 \
903          }
904 
905       /************** L E V E L  1 ***************/
906       /* Merge up to 8 rows of B to L1 work array*/
907       while (L1_rowsleft) {
908         outputi_nnz = 0;
909         if (anzi > 8)  outputj = workj_L1 + L1_nnz;     /* Level 1 rowmerge*/
910         else           outputj = cj + ci_nnz;           /* Merge directly to C */
911 
912         switch (L1_rowsleft) {
913         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
914                  brow_end[0] = inputj + inputi[inputcol[0]+1];
915                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
916                  inputcol    += L1_rowsleft;
917                  rowsleft    -= L1_rowsleft;
918                  L1_rowsleft  = 0;
919                  break;
920         case 2:  MatMatMultSymbolic_RowMergeMacro(2);
921                  inputcol    += L1_rowsleft;
922                  rowsleft    -= L1_rowsleft;
923                  L1_rowsleft  = 0;
924                  break;
925         case 3: MatMatMultSymbolic_RowMergeMacro(3);
926                  inputcol    += L1_rowsleft;
927                  rowsleft    -= L1_rowsleft;
928                  L1_rowsleft  = 0;
929                  break;
930         case 4:  MatMatMultSymbolic_RowMergeMacro(4);
931                  inputcol    += L1_rowsleft;
932                  rowsleft    -= L1_rowsleft;
933                  L1_rowsleft  = 0;
934                  break;
935         case 5:  MatMatMultSymbolic_RowMergeMacro(5);
936                  inputcol    += L1_rowsleft;
937                  rowsleft    -= L1_rowsleft;
938                  L1_rowsleft  = 0;
939                  break;
940         case 6:  MatMatMultSymbolic_RowMergeMacro(6);
941                  inputcol    += L1_rowsleft;
942                  rowsleft    -= L1_rowsleft;
943                  L1_rowsleft  = 0;
944                  break;
945         case 7:  MatMatMultSymbolic_RowMergeMacro(7);
946                  inputcol    += L1_rowsleft;
947                  rowsleft    -= L1_rowsleft;
948                  L1_rowsleft  = 0;
949                  break;
950         default: MatMatMultSymbolic_RowMergeMacro(8);
951                  inputcol    += 8;
952                  rowsleft    -= 8;
953                  L1_rowsleft -= 8;
954                  break;
955         }
956         inputcol_L1           = inputcol;
957         L1_nnz               += outputi_nnz;
958         worki_L1[++L1_nrows]  = L1_nnz;
959       }
960 
961       /********************** L E V E L  2 ************************/
962       /* Merge from L1 work array to either C or to L2 work array */
963       if (anzi > 8) {
964         inputi      = worki_L1;
965         inputj      = workj_L1;
966         inputcol    = workcol;
967         outputi_nnz = 0;
968 
969         if (anzi <= 64) outputj = cj + ci_nnz;        /* Merge from L1 work array to C */
970         else            outputj = workj_L2 + L2_nnz;  /* Merge from L1 work array to L2 work array */
971 
972         switch (L1_nrows) {
973         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
974                  brow_end[0] = inputj + inputi[inputcol[0]+1];
975                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
976                  break;
977         case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
978         case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
979         case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
980         case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
981         case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
982         case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
983         case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
984         default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
985         }
986         L2_nnz               += outputi_nnz;
987         worki_L2[++L2_nrows]  = L2_nnz;
988 
989         /************************ L E V E L  3 **********************/
990         /* Merge from L2 work array to either C or to L2 work array */
991         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
992           inputi      = worki_L2;
993           inputj      = workj_L2;
994           inputcol    = workcol;
995           outputi_nnz = 0;
996           if (rowsleft) outputj = workj_L3;
997           else          outputj = cj + ci_nnz;
998           switch (L2_nrows) {
999           case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
1000                    brow_end[0] = inputj + inputi[inputcol[0]+1];
1001                    for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1002                    break;
1003           case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
1004           case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
1005           case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1006           case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1007           case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1008           case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1009           case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1010           default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1011           }
1012           L2_nrows    = 1;
1013           L2_nnz      = outputi_nnz;
1014           worki_L2[1] = outputi_nnz;
1015           /* Copy to workj_L2 */
1016           if (rowsleft) {
1017             for (k=0; k<outputi_nnz; ++k)  workj_L2[k] = outputj[k];
1018           }
1019         }
1020       }
1021     }  /* while (rowsleft) */
1022 #undef MatMatMultSymbolic_RowMergeMacro
1023 
1024     /* terminate current row */
1025     ci_nnz += outputi_nnz;
1026     ci[i+1] = ci_nnz;
1027   }
1028 
1029   /* Step 3: Create the new symbolic matrix */
1030   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
1031   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
1032 
1033   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1034   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1035   c          = (Mat_SeqAIJ*)(C->data);
1036   c->free_a  = PETSC_TRUE;
1037   c->free_ij = PETSC_TRUE;
1038   c->nonew   = 0;
1039 
1040   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1041 
1042   /* set MatInfo */
1043   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1044   if (afill < 1.0) afill = 1.0;
1045   c->maxnz                     = ci[am];
1046   c->nz                        = ci[am];
1047   C->info.mallocs           = ndouble;
1048   C->info.fill_ratio_given  = fill;
1049   C->info.fill_ratio_needed = afill;
1050 
1051 #if defined(PETSC_USE_INFO)
1052   if (ci[am]) {
1053     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
1054     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1055   } else {
1056     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
1057   }
1058 #endif
1059 
1060   /* Step 4: Free temporary work areas */
1061   ierr = PetscFree(workj_L1);CHKERRQ(ierr);
1062   ierr = PetscFree(workj_L2);CHKERRQ(ierr);
1063   ierr = PetscFree(workj_L3);CHKERRQ(ierr);
1064   PetscFunctionReturn(0);
1065 }
1066 
1067 /* concatenate unique entries and then sort */
1068 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C)
1069 {
1070   PetscErrorCode ierr;
1071   Mat_SeqAIJ     *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1072   const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
1073   PetscInt       *ci,*cj;
1074   PetscInt       am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1075   PetscReal      afill;
1076   PetscInt       i,j,ndouble = 0;
1077   PetscSegBuffer seg,segrow;
1078   char           *seen;
1079 
1080   PetscFunctionBegin;
1081   ierr  = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
1082   ci[0] = 0;
1083 
1084   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1085   ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr);
1086   ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr);
1087   ierr = PetscCalloc1(bn,&seen);CHKERRQ(ierr);
1088 
1089   /* Determine ci and cj */
1090   for (i=0; i<am; i++) {
1091     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1092     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1093     PetscInt packlen = 0,*PETSC_RESTRICT crow;
1094     /* Pack segrow */
1095     for (j=0; j<anzi; j++) {
1096       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
1097       for (k=bjstart; k<bjend; k++) {
1098         PetscInt bcol = bj[k];
1099         if (!seen[bcol]) { /* new entry */
1100           PetscInt *PETSC_RESTRICT slot;
1101           ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr);
1102           *slot = bcol;
1103           seen[bcol] = 1;
1104           packlen++;
1105         }
1106       }
1107     }
1108     ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr);
1109     ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr);
1110     ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr);
1111     ci[i+1] = ci[i] + packlen;
1112     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
1113   }
1114   ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr);
1115   ierr = PetscFree(seen);CHKERRQ(ierr);
1116 
1117   /* Column indices are in the segmented buffer */
1118   ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr);
1119   ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr);
1120 
1121   /* put together the new symbolic matrix */
1122   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
1123   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
1124 
1125   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1126   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1127   c          = (Mat_SeqAIJ*)(C->data);
1128   c->free_a  = PETSC_TRUE;
1129   c->free_ij = PETSC_TRUE;
1130   c->nonew   = 0;
1131 
1132   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1133 
1134   /* set MatInfo */
1135   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1136   if (afill < 1.0) afill = 1.0;
1137   c->maxnz                     = ci[am];
1138   c->nz                        = ci[am];
1139   C->info.mallocs           = ndouble;
1140   C->info.fill_ratio_given  = fill;
1141   C->info.fill_ratio_needed = afill;
1142 
1143 #if defined(PETSC_USE_INFO)
1144   if (ci[am]) {
1145     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
1146     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1147   } else {
1148     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
1149   }
1150 #endif
1151   PetscFunctionReturn(0);
1152 }
1153 
1154 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
1155 {
1156   PetscErrorCode      ierr;
1157   Mat_MatMatTransMult *abt=(Mat_MatMatTransMult *)data;
1158 
1159   PetscFunctionBegin;
1160   ierr = MatTransposeColoringDestroy(&abt->matcoloring);CHKERRQ(ierr);
1161   ierr = MatDestroy(&abt->Bt_den);CHKERRQ(ierr);
1162   ierr = MatDestroy(&abt->ABt_den);CHKERRQ(ierr);
1163   ierr = PetscFree(abt);CHKERRQ(ierr);
1164   PetscFunctionReturn(0);
1165 }
1166 
1167 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1168 {
1169   PetscErrorCode      ierr;
1170   Mat                 Bt;
1171   PetscInt            *bti,*btj;
1172   Mat_MatMatTransMult *abt;
1173   Mat_Product         *product = C->product;
1174   char                *alg;
1175 
1176   PetscFunctionBegin;
1177   if (!product) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1178   if (product->data) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
1179 
1180   /* create symbolic Bt */
1181   ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
1182   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr);
1183   ierr = MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
1184   ierr = MatSetType(Bt,((PetscObject)A)->type_name);CHKERRQ(ierr);
1185 
1186   /* get symbolic C=A*Bt */
1187   ierr = PetscStrallocpy(product->alg,&alg);CHKERRQ(ierr);
1188   ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); /* set algorithm for C = A*Bt */
1189   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr);
1190   ierr = MatProductSetAlgorithm(C,alg);CHKERRQ(ierr); /* resume original algorithm for ABt product */
1191   ierr = PetscFree(alg);CHKERRQ(ierr);
1192 
1193   /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
1194   ierr = PetscNew(&abt);CHKERRQ(ierr);
1195 
1196   product->data    = abt;
1197   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
1198 
1199   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
1200 
1201   abt->usecoloring = PETSC_FALSE;
1202   ierr = PetscStrcmp(product->alg,"color",&abt->usecoloring);CHKERRQ(ierr);
1203   if (abt->usecoloring) {
1204     /* Create MatTransposeColoring from symbolic C=A*B^T */
1205     MatTransposeColoring matcoloring;
1206     MatColoring          coloring;
1207     ISColoring           iscoloring;
1208     Mat                  Bt_dense,C_dense;
1209 
1210     /* inode causes memory problem */
1211     ierr = MatSetOption(C,MAT_USE_INODES,PETSC_FALSE);CHKERRQ(ierr);
1212 
1213     ierr = MatColoringCreate(C,&coloring);CHKERRQ(ierr);
1214     ierr = MatColoringSetDistance(coloring,2);CHKERRQ(ierr);
1215     ierr = MatColoringSetType(coloring,MATCOLORINGSL);CHKERRQ(ierr);
1216     ierr = MatColoringSetFromOptions(coloring);CHKERRQ(ierr);
1217     ierr = MatColoringApply(coloring,&iscoloring);CHKERRQ(ierr);
1218     ierr = MatColoringDestroy(&coloring);CHKERRQ(ierr);
1219     ierr = MatTransposeColoringCreate(C,iscoloring,&matcoloring);CHKERRQ(ierr);
1220 
1221     abt->matcoloring = matcoloring;
1222 
1223     ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr);
1224 
1225     /* Create Bt_dense and C_dense = A*Bt_dense */
1226     ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr);
1227     ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr);
1228     ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr);
1229     ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr);
1230 
1231     Bt_dense->assembled = PETSC_TRUE;
1232     abt->Bt_den         = Bt_dense;
1233 
1234     ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr);
1235     ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr);
1236     ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr);
1237     ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr);
1238 
1239     Bt_dense->assembled = PETSC_TRUE;
1240     abt->ABt_den  = C_dense;
1241 
1242 #if defined(PETSC_USE_INFO)
1243     {
1244       Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
1245       ierr = PetscInfo7(C,"Use coloring of C=A*B^T; B^T: %D %D, Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));CHKERRQ(ierr);
1246     }
1247 #endif
1248   }
1249   /* clean up */
1250   ierr = MatDestroy(&Bt);CHKERRQ(ierr);
1251   ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
1252   PetscFunctionReturn(0);
1253 }
1254 
1255 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1256 {
1257   PetscErrorCode      ierr;
1258   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1259   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
1260   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
1261   PetscLogDouble      flops=0.0;
1262   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
1263   Mat_MatMatTransMult *abt;
1264   Mat_Product         *product = C->product;
1265 
1266   PetscFunctionBegin;
1267   if (!product) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1268   abt = (Mat_MatMatTransMult *)product->data;
1269   if (!abt) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1270   /* clear old values in C */
1271   if (!c->a) {
1272     ierr      = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
1273     c->a      = ca;
1274     c->free_a = PETSC_TRUE;
1275   } else {
1276     ca =  c->a;
1277     ierr = PetscArrayzero(ca,ci[cm]+1);CHKERRQ(ierr);
1278   }
1279 
1280   if (abt->usecoloring) {
1281     MatTransposeColoring matcoloring = abt->matcoloring;
1282     Mat                  Bt_dense,C_dense = abt->ABt_den;
1283 
1284     /* Get Bt_dense by Apply MatTransposeColoring to B */
1285     Bt_dense = abt->Bt_den;
1286     ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr);
1287 
1288     /* C_dense = A*Bt_dense */
1289     ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr);
1290 
1291     /* Recover C from C_dense */
1292     ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr);
1293     PetscFunctionReturn(0);
1294   }
1295 
1296   for (i=0; i<cm; i++) {
1297     anzi = ai[i+1] - ai[i];
1298     acol = aj + ai[i];
1299     aval = aa + ai[i];
1300     cnzi = ci[i+1] - ci[i];
1301     ccol = cj + ci[i];
1302     cval = ca + ci[i];
1303     for (j=0; j<cnzi; j++) {
1304       brow = ccol[j];
1305       bnzj = bi[brow+1] - bi[brow];
1306       bcol = bj + bi[brow];
1307       bval = ba + bi[brow];
1308 
1309       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
1310       nexta = 0; nextb = 0;
1311       while (nexta<anzi && nextb<bnzj) {
1312         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
1313         if (nexta == anzi) break;
1314         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
1315         if (nextb == bnzj) break;
1316         if (acol[nexta] == bcol[nextb]) {
1317           cval[j] += aval[nexta]*bval[nextb];
1318           nexta++; nextb++;
1319           flops += 2;
1320         }
1321       }
1322     }
1323   }
1324   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1325   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1326   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1327   PetscFunctionReturn(0);
1328 }
1329 
1330 PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
1331 {
1332   PetscErrorCode      ierr;
1333   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult*)data;
1334 
1335   PetscFunctionBegin;
1336   ierr = MatDestroy(&atb->At);CHKERRQ(ierr);
1337   if (atb->destroy) {
1338     ierr = (*atb->destroy)(atb->data);CHKERRQ(ierr);
1339   }
1340   ierr = PetscFree(atb);CHKERRQ(ierr);
1341   PetscFunctionReturn(0);
1342 }
1343 
1344 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1345 {
1346   PetscErrorCode      ierr;
1347   Mat                 At;
1348   PetscInt            *ati,*atj;
1349   Mat_Product         *product = C->product;
1350   MatProductAlgorithm alg;
1351   PetscBool           flg;
1352 
1353   PetscFunctionBegin;
1354   if (product) {
1355     alg = product->alg;
1356   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"!product, not supported yet");
1357 
1358   /* outerproduct */
1359   ierr = PetscStrcmp(alg,"outerproduct",&flg);CHKERRQ(ierr);
1360   if (flg) {
1361     /* create symbolic At */
1362     ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
1363     ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr);
1364     ierr = MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
1365     ierr = MatSetType(At,((PetscObject)A)->type_name);CHKERRQ(ierr);
1366 
1367     /* get symbolic C=At*B */
1368     ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr);
1369     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
1370 
1371     /* clean up */
1372     ierr = MatDestroy(&At);CHKERRQ(ierr);
1373     ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
1374 
1375     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
1376     ierr = MatProductSetAlgorithm(C,"outerproduct");CHKERRQ(ierr);
1377     PetscFunctionReturn(0);
1378   }
1379 
1380   /* matmatmult */
1381   ierr = PetscStrcmp(alg,"at*b",&flg);CHKERRQ(ierr);
1382   if (flg) {
1383     Mat_MatTransMatMult *atb;
1384 
1385     if (product->data) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
1386     ierr = PetscNew(&atb);CHKERRQ(ierr);
1387     ierr = MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);CHKERRQ(ierr);
1388     ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr);
1389     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
1390     ierr = MatProductSetAlgorithm(C,"at*b");CHKERRQ(ierr);
1391     product->data    = atb;
1392     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
1393     atb->At          = At;
1394     atb->updateAt    = PETSC_FALSE; /* because At is computed here */
1395 
1396     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
1397     PetscFunctionReturn(0);
1398   }
1399 
1400   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1401   PetscFunctionReturn(0);
1402 }
1403 
1404 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1405 {
1406   PetscErrorCode ierr;
1407   Mat_SeqAIJ     *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1408   PetscInt       am   =A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1409   PetscInt       cm   =C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1410   PetscLogDouble flops=0.0;
1411   MatScalar      *aa  =a->a,*ba,*ca,*caj;
1412 
1413   PetscFunctionBegin;
1414   if (!c->a) {
1415     ierr = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
1416 
1417     c->a      = ca;
1418     c->free_a = PETSC_TRUE;
1419   } else {
1420     ca   = c->a;
1421     ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
1422   }
1423 
1424   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1425   for (i=0; i<am; i++) {
1426     bj   = b->j + bi[i];
1427     ba   = b->a + bi[i];
1428     bnzi = bi[i+1] - bi[i];
1429     anzi = ai[i+1] - ai[i];
1430     for (j=0; j<anzi; j++) {
1431       nextb = 0;
1432       crow  = *aj++;
1433       cjj   = cj + ci[crow];
1434       caj   = ca + ci[crow];
1435       /* perform sparse axpy operation.  Note cjj includes bj. */
1436       for (k=0; nextb<bnzi; k++) {
1437         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
1438           caj[k] += (*aa)*(*(ba+nextb));
1439           nextb++;
1440         }
1441       }
1442       flops += 2*bnzi;
1443       aa++;
1444     }
1445   }
1446 
1447   /* Assemble the final matrix and clean up */
1448   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1449   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1450   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1451   PetscFunctionReturn(0);
1452 }
1453 
1454 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
1455 {
1456   PetscErrorCode ierr;
1457 
1458   PetscFunctionBegin;
1459   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
1460 
1461   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1462   PetscFunctionReturn(0);
1463 }
1464 
1465 PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1466 {
1467   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
1468   Mat_SeqDense      *bd=(Mat_SeqDense*)B->data;
1469   Mat_SeqDense      *cd=(Mat_SeqDense*)C->data;
1470   PetscErrorCode    ierr;
1471   PetscScalar       *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4;
1472   const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av;
1473   const PetscInt    *aj;
1474   PetscInt          cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n;
1475   PetscInt          clda=cd->lda;
1476   PetscInt          am4=4*clda,bm4=4*bm,col,i,j,n;
1477 
1478   PetscFunctionBegin;
1479   if (!cm || !cn) PetscFunctionReturn(0);
1480   ierr = MatSeqAIJGetArrayRead(A,&av);CHKERRQ(ierr);
1481   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1482   ierr = MatDenseGetArrayRead(B,&b);CHKERRQ(ierr);
1483   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1484   c1 = c; c2 = c1 + clda; c3 = c2 + clda; c4 = c3 + clda;
1485   for (col=0; col<(cn/4)*4; col += 4) {  /* over columns of C */
1486     for (i=0; i<am; i++) {        /* over rows of A in those columns */
1487       r1 = r2 = r3 = r4 = 0.0;
1488       n  = a->i[i+1] - a->i[i];
1489       aj = a->j + a->i[i];
1490       aa = av + a->i[i];
1491       for (j=0; j<n; j++) {
1492         const PetscScalar aatmp = aa[j];
1493         const PetscInt    ajtmp = aj[j];
1494         r1 += aatmp*b1[ajtmp];
1495         r2 += aatmp*b2[ajtmp];
1496         r3 += aatmp*b3[ajtmp];
1497         r4 += aatmp*b4[ajtmp];
1498       }
1499       c1[i] += r1;
1500       c2[i] += r2;
1501       c3[i] += r3;
1502       c4[i] += r4;
1503     }
1504     b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1505     c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1506   }
1507   for (; col<cn; col++) {   /* over extra columns of C */
1508     for (i=0; i<am; i++) {  /* over rows of C in those columns */
1509       r1 = 0.0;
1510       n  = a->i[i+1] - a->i[i];
1511       aj = a->j + a->i[i];
1512       aa = av + a->i[i];
1513       for (j=0; j<n; j++) {
1514         r1 += aa[j]*b1[aj[j]];
1515       }
1516       c1[i] += r1;
1517     }
1518     b1 += bm;
1519     c1 += clda;
1520   }
1521   ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr);
1522   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1523   ierr = MatDenseRestoreArrayRead(B,&b);CHKERRQ(ierr);
1524   ierr = MatSeqAIJRestoreArrayRead(A,&av);CHKERRQ(ierr);
1525   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1526   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1527   PetscFunctionReturn(0);
1528 }
1529 
1530 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1531 {
1532   PetscErrorCode ierr;
1533 
1534   PetscFunctionBegin;
1535   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
1536   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1537   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1538 
1539   ierr = MatZeroEntries(C);CHKERRQ(ierr);
1540   ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C);CHKERRQ(ierr);
1541   PetscFunctionReturn(0);
1542 }
1543 
1544 /* ------------------------------------------------------- */
1545 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1546 {
1547   PetscFunctionBegin;
1548   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
1549   C->ops->productsymbolic = MatProductSymbolic_AB;
1550   PetscFunctionReturn(0);
1551 }
1552 
1553 PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat,Mat,PetscReal,Mat);
1554 
1555 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1556 {
1557   PetscFunctionBegin;
1558   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1559   C->ops->productsymbolic          = MatProductSymbolic_AtB;
1560   PetscFunctionReturn(0);
1561 }
1562 
1563 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
1564 {
1565   PetscFunctionBegin;
1566   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1567   C->ops->productsymbolic          = MatProductSymbolic_ABt;
1568   PetscFunctionReturn(0);
1569 }
1570 
1571 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1572 {
1573   PetscErrorCode ierr;
1574   Mat_Product    *product = C->product;
1575 
1576   PetscFunctionBegin;
1577   switch (product->type) {
1578   case MATPRODUCT_AB:
1579     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C);CHKERRQ(ierr);
1580     break;
1581   case MATPRODUCT_AtB:
1582     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C);CHKERRQ(ierr);
1583     break;
1584   case MATPRODUCT_ABt:
1585     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C);CHKERRQ(ierr);
1586     break;
1587   default:
1588     break;
1589   }
1590   PetscFunctionReturn(0);
1591 }
1592 /* ------------------------------------------------------- */
1593 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1594 {
1595   PetscErrorCode ierr;
1596   Mat_Product    *product = C->product;
1597   Mat            A = product->A;
1598   PetscBool      baij;
1599 
1600   PetscFunctionBegin;
1601   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij);CHKERRQ(ierr);
1602   if (!baij) { /* A is seqsbaij */
1603     PetscBool sbaij;
1604     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij);CHKERRQ(ierr);
1605     if (!sbaij) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format");
1606 
1607     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
1608   } else { /* A is seqbaij */
1609     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
1610   }
1611 
1612   C->ops->productsymbolic = MatProductSymbolic_AB;
1613   PetscFunctionReturn(0);
1614 }
1615 
1616 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1617 {
1618   PetscErrorCode ierr;
1619   Mat_Product    *product = C->product;
1620 
1621   PetscFunctionBegin;
1622   MatCheckProduct(C,1);
1623   if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
1624   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric)) {
1625     ierr = MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C);CHKERRQ(ierr);
1626   }
1627   PetscFunctionReturn(0);
1628 }
1629 
1630 /* ------------------------------------------------------- */
1631 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1632 {
1633   PetscFunctionBegin;
1634   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
1635   C->ops->productsymbolic = MatProductSymbolic_AB;
1636   PetscFunctionReturn(0);
1637 }
1638 
1639 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1640 {
1641   PetscErrorCode ierr;
1642   Mat_Product    *product = C->product;
1643 
1644   PetscFunctionBegin;
1645   if (product->type == MATPRODUCT_AB) {
1646     ierr = MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C);CHKERRQ(ierr);
1647   }
1648   PetscFunctionReturn(0);
1649 }
1650 /* ------------------------------------------------------- */
1651 
1652 PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1653 {
1654   PetscErrorCode ierr;
1655   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
1656   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
1657   PetscInt       *bi      = b->i,*bj=b->j;
1658   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1659   MatScalar      *btval,*btval_den,*ba=b->a;
1660   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1661 
1662   PetscFunctionBegin;
1663   btval_den=btdense->v;
1664   ierr     = PetscArrayzero(btval_den,m*n);CHKERRQ(ierr);
1665   for (k=0; k<ncolors; k++) {
1666     ncolumns = coloring->ncolumns[k];
1667     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1668       col   = *(columns + colorforcol[k] + l);
1669       btcol = bj + bi[col];
1670       btval = ba + bi[col];
1671       anz   = bi[col+1] - bi[col];
1672       for (j=0; j<anz; j++) {
1673         brow            = btcol[j];
1674         btval_den[brow] = btval[j];
1675       }
1676     }
1677     btval_den += m;
1678   }
1679   PetscFunctionReturn(0);
1680 }
1681 
1682 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1683 {
1684   PetscErrorCode    ierr;
1685   Mat_SeqAIJ        *csp = (Mat_SeqAIJ*)Csp->data;
1686   const PetscScalar *ca_den,*ca_den_ptr;
1687   PetscScalar       *ca=csp->a;
1688   PetscInt          k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1689   PetscInt          brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1690   PetscInt          nrows,*row,*idx;
1691   PetscInt          *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
1692 
1693   PetscFunctionBegin;
1694   ierr   = MatDenseGetArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1695 
1696   if (brows > 0) {
1697     PetscInt *lstart,row_end,row_start;
1698     lstart = matcoloring->lstart;
1699     ierr = PetscArrayzero(lstart,ncolors);CHKERRQ(ierr);
1700 
1701     row_end = brows;
1702     if (row_end > m) row_end = m;
1703     for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1704       ca_den_ptr = ca_den;
1705       for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1706         nrows = matcoloring->nrows[k];
1707         row   = rows  + colorforrow[k];
1708         idx   = den2sp + colorforrow[k];
1709         for (l=lstart[k]; l<nrows; l++) {
1710           if (row[l] >= row_end) {
1711             lstart[k] = l;
1712             break;
1713           } else {
1714             ca[idx[l]] = ca_den_ptr[row[l]];
1715           }
1716         }
1717         ca_den_ptr += m;
1718       }
1719       row_end += brows;
1720       if (row_end > m) row_end = m;
1721     }
1722   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1723     ca_den_ptr = ca_den;
1724     for (k=0; k<ncolors; k++) {
1725       nrows = matcoloring->nrows[k];
1726       row   = rows  + colorforrow[k];
1727       idx   = den2sp + colorforrow[k];
1728       for (l=0; l<nrows; l++) {
1729         ca[idx[l]] = ca_den_ptr[row[l]];
1730       }
1731       ca_den_ptr += m;
1732     }
1733   }
1734 
1735   ierr = MatDenseRestoreArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1736 #if defined(PETSC_USE_INFO)
1737   if (matcoloring->brows > 0) {
1738     ierr = PetscInfo1(Csp,"Loop over %D row blocks for den2sp\n",brows);CHKERRQ(ierr);
1739   } else {
1740     ierr = PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");CHKERRQ(ierr);
1741   }
1742 #endif
1743   PetscFunctionReturn(0);
1744 }
1745 
1746 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1747 {
1748   PetscErrorCode ierr;
1749   PetscInt       i,n,nrows,Nbs,j,k,m,ncols,col,cm;
1750   const PetscInt *is,*ci,*cj,*row_idx;
1751   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1752   IS             *isa;
1753   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1754   PetscInt       *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1755   PetscInt       *colorforcol,*columns,*columns_i,brows;
1756   PetscBool      flg;
1757 
1758   PetscFunctionBegin;
1759   ierr = ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa);CHKERRQ(ierr);
1760 
1761   /* bs >1 is not being tested yet! */
1762   Nbs       = mat->cmap->N/bs;
1763   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1764   c->N      = Nbs;
1765   c->m      = c->M;
1766   c->rstart = 0;
1767   c->brows  = 100;
1768 
1769   c->ncolors = nis;
1770   ierr = PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);CHKERRQ(ierr);
1771   ierr = PetscMalloc1(csp->nz+1,&rows);CHKERRQ(ierr);
1772   ierr = PetscMalloc1(csp->nz+1,&den2sp);CHKERRQ(ierr);
1773 
1774   brows = c->brows;
1775   ierr = PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);CHKERRQ(ierr);
1776   if (flg) c->brows = brows;
1777   if (brows > 0) {
1778     ierr = PetscMalloc1(nis+1,&c->lstart);CHKERRQ(ierr);
1779   }
1780 
1781   colorforrow[0] = 0;
1782   rows_i         = rows;
1783   den2sp_i       = den2sp;
1784 
1785   ierr = PetscMalloc1(nis+1,&colorforcol);CHKERRQ(ierr);
1786   ierr = PetscMalloc1(Nbs+1,&columns);CHKERRQ(ierr);
1787 
1788   colorforcol[0] = 0;
1789   columns_i      = columns;
1790 
1791   /* get column-wise storage of mat */
1792   ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1793 
1794   cm   = c->m;
1795   ierr = PetscMalloc1(cm+1,&rowhit);CHKERRQ(ierr);
1796   ierr = PetscMalloc1(cm+1,&idxhit);CHKERRQ(ierr);
1797   for (i=0; i<nis; i++) { /* loop over color */
1798     ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr);
1799     ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr);
1800 
1801     c->ncolumns[i] = n;
1802     if (n) {
1803       ierr = PetscArraycpy(columns_i,is,n);CHKERRQ(ierr);
1804     }
1805     colorforcol[i+1] = colorforcol[i] + n;
1806     columns_i       += n;
1807 
1808     /* fast, crude version requires O(N*N) work */
1809     ierr = PetscArrayzero(rowhit,cm);CHKERRQ(ierr);
1810 
1811     for (j=0; j<n; j++) { /* loop over columns*/
1812       col     = is[j];
1813       row_idx = cj + ci[col];
1814       m       = ci[col+1] - ci[col];
1815       for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1816         idxhit[*row_idx]   = spidx[ci[col] + k];
1817         rowhit[*row_idx++] = col + 1;
1818       }
1819     }
1820     /* count the number of hits */
1821     nrows = 0;
1822     for (j=0; j<cm; j++) {
1823       if (rowhit[j]) nrows++;
1824     }
1825     c->nrows[i]      = nrows;
1826     colorforrow[i+1] = colorforrow[i] + nrows;
1827 
1828     nrows = 0;
1829     for (j=0; j<cm; j++) { /* loop over rows */
1830       if (rowhit[j]) {
1831         rows_i[nrows]   = j;
1832         den2sp_i[nrows] = idxhit[j];
1833         nrows++;
1834       }
1835     }
1836     den2sp_i += nrows;
1837 
1838     ierr    = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr);
1839     rows_i += nrows;
1840   }
1841   ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1842   ierr = PetscFree(rowhit);CHKERRQ(ierr);
1843   ierr = ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa);CHKERRQ(ierr);
1844   if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1845 
1846   c->colorforrow = colorforrow;
1847   c->rows        = rows;
1848   c->den2sp      = den2sp;
1849   c->colorforcol = colorforcol;
1850   c->columns     = columns;
1851 
1852   ierr = PetscFree(idxhit);CHKERRQ(ierr);
1853   PetscFunctionReturn(0);
1854 }
1855 
1856 /* --------------------------------------------------------------- */
1857 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1858 {
1859   PetscErrorCode ierr;
1860   Mat_Product    *product = C->product;
1861   Mat            A=product->A,B=product->B;
1862 
1863   PetscFunctionBegin;
1864   if (C->ops->mattransposemultnumeric) {
1865     /* Alg: "outerproduct" */
1866     ierr = (*C->ops->mattransposemultnumeric)(A,B,C);CHKERRQ(ierr);
1867   } else {
1868     /* Alg: "matmatmult" -- C = At*B */
1869     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;
1870     Mat                 At;
1871 
1872     if (!atb) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1873     At = atb->At;
1874     if (atb->updateAt) { /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ() */
1875       ierr = MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);CHKERRQ(ierr);
1876     }
1877     ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ(At,B,C);CHKERRQ(ierr);
1878     atb->updateAt = PETSC_TRUE;
1879   }
1880   PetscFunctionReturn(0);
1881 }
1882 
1883 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1884 {
1885   PetscErrorCode ierr;
1886   Mat_Product    *product = C->product;
1887   Mat            A=product->A,B=product->B;
1888   PetscReal      fill=product->fill;
1889 
1890   PetscFunctionBegin;
1891   ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
1892 
1893   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
1894   PetscFunctionReturn(0);
1895 }
1896 
1897 /* --------------------------------------------------------------- */
1898 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1899 {
1900   PetscErrorCode ierr;
1901   Mat_Product    *product = C->product;
1902   PetscInt       alg = 0; /* default algorithm */
1903   PetscBool      flg = PETSC_FALSE;
1904 #if !defined(PETSC_HAVE_HYPRE)
1905   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
1906   PetscInt       nalg = 7;
1907 #else
1908   const char     *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"};
1909   PetscInt       nalg = 8;
1910 #endif
1911 
1912   PetscFunctionBegin;
1913   /* Set default algorithm */
1914   ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr);
1915   if (flg) {
1916     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1917   }
1918 
1919   /* Get runtime option */
1920   if (product->api_user) {
1921     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
1922     ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
1923     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1924   } else {
1925     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
1926     ierr = PetscOptionsEList("-matproduct_ab_via","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
1927     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1928   }
1929   if (flg) {
1930     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1931   }
1932 
1933   C->ops->productsymbolic = MatProductSymbolic_AB;
1934   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
1935   PetscFunctionReturn(0);
1936 }
1937 
1938 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
1939 {
1940   PetscErrorCode ierr;
1941   Mat_Product    *product = C->product;
1942   PetscInt       alg = 0; /* default algorithm */
1943   PetscBool      flg = PETSC_FALSE;
1944   const char     *algTypes[2] = {"at*b","outerproduct"};
1945   PetscInt       nalg = 2;
1946 
1947   PetscFunctionBegin;
1948   /* Set default algorithm */
1949   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
1950   if (flg) {
1951     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1952   }
1953 
1954   /* Get runtime option */
1955   if (product->api_user) {
1956     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
1957     ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
1958     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1959   } else {
1960     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
1961     ierr = PetscOptionsEList("-matproduct_atb_via","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
1962     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1963   }
1964   if (flg) {
1965     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1966   }
1967 
1968   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
1969   PetscFunctionReturn(0);
1970 }
1971 
1972 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
1973 {
1974   PetscErrorCode ierr;
1975   Mat_Product    *product = C->product;
1976   PetscInt       alg = 0; /* default algorithm */
1977   PetscBool      flg = PETSC_FALSE;
1978   const char     *algTypes[2] = {"default","color"};
1979   PetscInt       nalg = 2;
1980 
1981   PetscFunctionBegin;
1982   /* Set default algorithm */
1983   ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr);
1984   if (!flg) {
1985     alg = 1;
1986     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1987   }
1988 
1989   /* Get runtime option */
1990   if (product->api_user) {
1991     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");CHKERRQ(ierr);
1992     ierr = PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
1993     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1994   } else {
1995     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");CHKERRQ(ierr);
1996     ierr = PetscOptionsEList("-matproduct_abt_via","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
1997     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1998   }
1999   if (flg) {
2000     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2001   }
2002 
2003   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
2004   C->ops->productsymbolic          = MatProductSymbolic_ABt;
2005   PetscFunctionReturn(0);
2006 }
2007 
2008 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
2009 {
2010   PetscErrorCode ierr;
2011   Mat_Product    *product = C->product;
2012   PetscBool      flg = PETSC_FALSE;
2013   PetscInt       alg = 0; /* default algorithm -- alg=1 should be default!!! */
2014 #if !defined(PETSC_HAVE_HYPRE)
2015   const char      *algTypes[2] = {"scalable","rap"};
2016   PetscInt        nalg = 2;
2017 #else
2018   const char      *algTypes[3] = {"scalable","rap","hypre"};
2019   PetscInt        nalg = 3;
2020 #endif
2021 
2022   PetscFunctionBegin;
2023   /* Set default algorithm */
2024   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
2025   if (flg) {
2026     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2027   }
2028 
2029   /* Get runtime option */
2030   if (product->api_user) {
2031     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
2032     ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
2033     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2034   } else {
2035     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
2036     ierr = PetscOptionsEList("-matproduct_ptap_via","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
2037     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2038   }
2039   if (flg) {
2040     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2041   }
2042 
2043   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
2044   PetscFunctionReturn(0);
2045 }
2046 
2047 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2048 {
2049   PetscErrorCode ierr;
2050   Mat_Product    *product = C->product;
2051   PetscBool      flg = PETSC_FALSE;
2052   PetscInt       alg = 0; /* default algorithm */
2053   const char     *algTypes[3] = {"r*a*rt","r*art","coloring_rart"};
2054   PetscInt        nalg = 3;
2055 
2056   PetscFunctionBegin;
2057   /* Set default algorithm */
2058   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
2059   if (flg) {
2060     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2061   }
2062 
2063   /* Get runtime option */
2064   if (product->api_user) {
2065     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
2066     ierr = PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
2067     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2068   } else {
2069     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
2070     ierr = PetscOptionsEList("-matproduct_rart_via","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
2071     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2072   }
2073   if (flg) {
2074     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2075   }
2076 
2077   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
2078   PetscFunctionReturn(0);
2079 }
2080 
2081 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
2082 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2083 {
2084   PetscErrorCode ierr;
2085   Mat_Product    *product = C->product;
2086   PetscInt       alg = 0; /* default algorithm */
2087   PetscBool      flg = PETSC_FALSE;
2088   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
2089   PetscInt       nalg = 7;
2090 
2091   PetscFunctionBegin;
2092   /* Set default algorithm */
2093   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
2094   if (flg) {
2095     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2096   }
2097 
2098   /* Get runtime option */
2099   if (product->api_user) {
2100     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
2101     ierr = PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
2102     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2103   } else {
2104     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
2105     ierr = PetscOptionsEList("-matproduct_abc_via","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
2106     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2107   }
2108   if (flg) {
2109     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2110   }
2111 
2112   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
2113   C->ops->productsymbolic    = MatProductSymbolic_ABC;
2114   PetscFunctionReturn(0);
2115 }
2116 
2117 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2118 {
2119   PetscErrorCode ierr;
2120   Mat_Product    *product = C->product;
2121 
2122   PetscFunctionBegin;
2123   switch (product->type) {
2124   case MATPRODUCT_AB:
2125     ierr = MatProductSetFromOptions_SeqAIJ_AB(C);CHKERRQ(ierr);
2126     break;
2127   case MATPRODUCT_AtB:
2128     ierr = MatProductSetFromOptions_SeqAIJ_AtB(C);CHKERRQ(ierr);
2129     break;
2130   case MATPRODUCT_ABt:
2131     ierr = MatProductSetFromOptions_SeqAIJ_ABt(C);CHKERRQ(ierr);
2132     break;
2133   case MATPRODUCT_PtAP:
2134     ierr = MatProductSetFromOptions_SeqAIJ_PtAP(C);CHKERRQ(ierr);
2135     break;
2136   case MATPRODUCT_RARt:
2137     ierr = MatProductSetFromOptions_SeqAIJ_RARt(C);CHKERRQ(ierr);
2138     break;
2139   case MATPRODUCT_ABC:
2140     ierr = MatProductSetFromOptions_SeqAIJ_ABC(C);CHKERRQ(ierr);
2141     break;
2142   default:
2143     break;
2144   }
2145   PetscFunctionReturn(0);
2146 }
2147