xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 967582eba84cc53dc1fbf6b54d6aa49b7d83bae6)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4           C = A * B
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <petscbt.h>
10 #include <petsc/private/isimpl.h>
11 #include <../src/mat/impls/dense/seq/dense.h>
12 
13 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
14 {
15   PetscErrorCode ierr;
16 
17   PetscFunctionBegin;
18   if (C->ops->matmultnumeric) {
19     ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr);
20   } else {
21     ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C);CHKERRQ(ierr);
22   }
23   PetscFunctionReturn(0);
24 }
25 
26 /* Modified from MatCreateSeqAIJWithArrays() */
27 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat)
28 {
29   PetscErrorCode ierr;
30   PetscInt       ii;
31   Mat_SeqAIJ     *aij;
32   PetscBool      isseqaij;
33 
34   PetscFunctionBegin;
35   if (m > 0 && i[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0");
36   ierr = MatSetSizes(mat,m,n,m,n);CHKERRQ(ierr);
37 
38   if (!mtype) {
39     ierr = PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
40     if (!isseqaij) { ierr = MatSetType(mat,MATSEQAIJ);CHKERRQ(ierr); }
41   } else {
42     ierr = MatSetType(mat,mtype);CHKERRQ(ierr);
43   }
44   ierr = MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,0);CHKERRQ(ierr);
45   aij  = (Mat_SeqAIJ*)(mat)->data;
46   ierr = PetscMalloc1(m,&aij->imax);CHKERRQ(ierr);
47   ierr = PetscMalloc1(m,&aij->ilen);CHKERRQ(ierr);
48 
49   aij->i            = i;
50   aij->j            = j;
51   aij->a            = a;
52   aij->singlemalloc = PETSC_FALSE;
53   aij->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
54   aij->free_a       = PETSC_FALSE;
55   aij->free_ij      = PETSC_FALSE;
56 
57   for (ii=0; ii<m; ii++) {
58     aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii];
59   }
60 
61   PetscFunctionReturn(0);
62 }
63 
64 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
65 {
66   PetscErrorCode      ierr;
67   Mat_Product         *product = C->product;
68   MatProductAlgorithm alg;
69   PetscBool           flg;
70 
71   PetscFunctionBegin;
72   if (product) {
73     alg = product->alg;
74   } else {
75     alg = "sorted";
76   }
77   /* sorted */
78   ierr = PetscStrcmp(alg,"sorted",&flg);CHKERRQ(ierr);
79   if (flg) {
80     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C);CHKERRQ(ierr);
81     PetscFunctionReturn(0);
82   }
83 
84   /* scalable */
85   ierr = PetscStrcmp(alg,"scalable",&flg);CHKERRQ(ierr);
86   if (flg) {
87     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr);
88     PetscFunctionReturn(0);
89   }
90 
91   /* scalable_fast */
92   ierr = PetscStrcmp(alg,"scalable_fast",&flg);CHKERRQ(ierr);
93   if (flg) {
94     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr);
95     PetscFunctionReturn(0);
96   }
97 
98   /* heap */
99   ierr = PetscStrcmp(alg,"heap",&flg);CHKERRQ(ierr);
100   if (flg) {
101     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr);
102     PetscFunctionReturn(0);
103   }
104 
105   /* btheap */
106   ierr = PetscStrcmp(alg,"btheap",&flg);CHKERRQ(ierr);
107   if (flg) {
108     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr);
109     PetscFunctionReturn(0);
110   }
111 
112   /* llcondensed */
113   ierr = PetscStrcmp(alg,"llcondensed",&flg);CHKERRQ(ierr);
114   if (flg) {
115     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr);
116     PetscFunctionReturn(0);
117   }
118 
119   /* rowmerge */
120   ierr = PetscStrcmp(alg,"rowmerge",&flg);CHKERRQ(ierr);
121   if (flg) {
122     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);CHKERRQ(ierr);
123     PetscFunctionReturn(0);
124   }
125 
126 #if defined(PETSC_HAVE_HYPRE)
127   ierr = PetscStrcmp(alg,"hypre",&flg);CHKERRQ(ierr);
128   if (flg) {
129     ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr);
130     PetscFunctionReturn(0);
131   }
132 #endif
133 
134   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
135   PetscFunctionReturn(0);
136 }
137 
138 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C)
139 {
140   PetscErrorCode     ierr;
141   Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
142   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
143   PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
144   PetscReal          afill;
145   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
146   PetscTable         ta;
147   PetscBT            lnkbt;
148   PetscFreeSpaceList free_space=NULL,current_space=NULL;
149 
150   PetscFunctionBegin;
151   /* Get ci and cj */
152   /*---------------*/
153   /* Allocate ci array, arrays for fill computation and */
154   /* free space for accumulating nonzero column info */
155   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
156   ci[0] = 0;
157 
158   /* create and initialize a linked list */
159   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
160   MatRowMergeMax_SeqAIJ(b,bm,ta);
161   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
162   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
163 
164   ierr = PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);CHKERRQ(ierr);
165 
166   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
167   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
168 
169   current_space = free_space;
170 
171   /* Determine ci and cj */
172   for (i=0; i<am; i++) {
173     anzi = ai[i+1] - ai[i];
174     aj   = a->j + ai[i];
175     for (j=0; j<anzi; j++) {
176       brow = aj[j];
177       bnzj = bi[brow+1] - bi[brow];
178       bj   = b->j + bi[brow];
179       /* add non-zero cols of B into the sorted linked list lnk */
180       ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr);
181     }
182     cnzi = lnk[0];
183 
184     /* If free space is not available, make more free space */
185     /* Double the amount of total space in the list */
186     if (current_space->local_remaining<cnzi) {
187       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
188       ndouble++;
189     }
190 
191     /* Copy data into free space, then initialize lnk */
192     ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
193 
194     current_space->array           += cnzi;
195     current_space->local_used      += cnzi;
196     current_space->local_remaining -= cnzi;
197 
198     ci[i+1] = ci[i] + cnzi;
199   }
200 
201   /* Column indices are in the list of free space */
202   /* Allocate space for cj, initialize cj, and */
203   /* destroy list of free space and other temporary array(s) */
204   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
205   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
206   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
207 
208   /* put together the new symbolic matrix */
209   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
210   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
211 
212   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
213   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
214   c                         = (Mat_SeqAIJ*)(C->data);
215   c->free_a                 = PETSC_FALSE;
216   c->free_ij                = PETSC_TRUE;
217   c->nonew                  = 0;
218 
219   /* fast, needs non-scalable O(bn) array 'abdense' */
220   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
221 
222   /* set MatInfo */
223   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
224   if (afill < 1.0) afill = 1.0;
225   c->maxnz                     = ci[am];
226   c->nz                        = ci[am];
227   C->info.mallocs           = ndouble;
228   C->info.fill_ratio_given  = fill;
229   C->info.fill_ratio_needed = afill;
230 
231 #if defined(PETSC_USE_INFO)
232   if (ci[am]) {
233     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
234     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
235   } else {
236     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
237   }
238 #endif
239   PetscFunctionReturn(0);
240 }
241 
242 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C)
243 {
244   PetscErrorCode ierr;
245   PetscLogDouble flops=0.0;
246   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
247   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
248   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
249   PetscInt       *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
250   PetscInt       am   =A->rmap->n,cm=C->rmap->n;
251   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
252   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca,valtmp;
253   PetscScalar    *ab_dense;
254 
255   PetscFunctionBegin;
256   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
257     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
258     c->a      = ca;
259     c->free_a = PETSC_TRUE;
260   } else {
261     ca        = c->a;
262   }
263   if (!c->matmult_abdense) {
264     ierr = PetscCalloc1(B->cmap->N,&ab_dense);CHKERRQ(ierr);
265     c->matmult_abdense = ab_dense;
266   } else {
267     ab_dense = c->matmult_abdense;
268   }
269 
270   /* clean old values in C */
271   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
272   /* Traverse A row-wise. */
273   /* Build the ith row in C by summing over nonzero columns in A, */
274   /* the rows of B corresponding to nonzeros of A. */
275   for (i=0; i<am; i++) {
276     anzi = ai[i+1] - ai[i];
277     for (j=0; j<anzi; j++) {
278       brow = aj[j];
279       bnzi = bi[brow+1] - bi[brow];
280       bjj  = bj + bi[brow];
281       baj  = ba + bi[brow];
282       /* perform dense axpy */
283       valtmp = aa[j];
284       for (k=0; k<bnzi; k++) {
285         ab_dense[bjj[k]] += valtmp*baj[k];
286       }
287       flops += 2*bnzi;
288     }
289     aj += anzi; aa += anzi;
290 
291     cnzi = ci[i+1] - ci[i];
292     for (k=0; k<cnzi; k++) {
293       ca[k]          += ab_dense[cj[k]];
294       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
295     }
296     flops += cnzi;
297     cj    += cnzi; ca += cnzi;
298   }
299   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
300   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
301   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
302   PetscFunctionReturn(0);
303 }
304 
305 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
306 {
307   PetscErrorCode ierr;
308   PetscLogDouble flops=0.0;
309   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
310   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
311   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
312   PetscInt       *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
313   PetscInt       am   = A->rmap->N,cm=C->rmap->N;
314   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
315   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
316   PetscInt       nextb;
317 
318   PetscFunctionBegin;
319   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
320     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
321     c->a      = ca;
322     c->free_a = PETSC_TRUE;
323   }
324 
325   /* clean old values in C */
326   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
327   /* Traverse A row-wise. */
328   /* Build the ith row in C by summing over nonzero columns in A, */
329   /* the rows of B corresponding to nonzeros of A. */
330   for (i=0; i<am; i++) {
331     anzi = ai[i+1] - ai[i];
332     cnzi = ci[i+1] - ci[i];
333     for (j=0; j<anzi; j++) {
334       brow = aj[j];
335       bnzi = bi[brow+1] - bi[brow];
336       bjj  = bj + bi[brow];
337       baj  = ba + bi[brow];
338       /* perform sparse axpy */
339       valtmp = aa[j];
340       nextb  = 0;
341       for (k=0; nextb<bnzi; k++) {
342         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
343           ca[k] += valtmp*baj[nextb++];
344         }
345       }
346       flops += 2*bnzi;
347     }
348     aj += anzi; aa += anzi;
349     cj += cnzi; ca += cnzi;
350   }
351 
352   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
353   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
354   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
355   PetscFunctionReturn(0);
356 }
357 
358 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C)
359 {
360   PetscErrorCode     ierr;
361   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
362   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
363   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
364   MatScalar          *ca;
365   PetscReal          afill;
366   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
367   PetscTable         ta;
368   PetscFreeSpaceList free_space=NULL,current_space=NULL;
369 
370   PetscFunctionBegin;
371   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
372   /*-----------------------------------------------------------------------------------------*/
373   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
374   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
375   ci[0] = 0;
376 
377   /* create and initialize a linked list */
378   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
379   MatRowMergeMax_SeqAIJ(b,bm,ta);
380   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
381   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
382 
383   ierr = PetscLLCondensedCreate_fast(Crmax,&lnk);CHKERRQ(ierr);
384 
385   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
386   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
387   current_space = free_space;
388 
389   /* Determine ci and cj */
390   for (i=0; i<am; i++) {
391     anzi = ai[i+1] - ai[i];
392     aj   = a->j + ai[i];
393     for (j=0; j<anzi; j++) {
394       brow = aj[j];
395       bnzj = bi[brow+1] - bi[brow];
396       bj   = b->j + bi[brow];
397       /* add non-zero cols of B into the sorted linked list lnk */
398       ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr);
399     }
400     cnzi = lnk[1];
401 
402     /* If free space is not available, make more free space */
403     /* Double the amount of total space in the list */
404     if (current_space->local_remaining<cnzi) {
405       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
406       ndouble++;
407     }
408 
409     /* Copy data into free space, then initialize lnk */
410     ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr);
411 
412     current_space->array           += cnzi;
413     current_space->local_used      += cnzi;
414     current_space->local_remaining -= cnzi;
415 
416     ci[i+1] = ci[i] + cnzi;
417   }
418 
419   /* Column indices are in the list of free space */
420   /* Allocate space for cj, initialize cj, and */
421   /* destroy list of free space and other temporary array(s) */
422   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
423   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
424   ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr);
425 
426   /* Allocate space for ca */
427   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
428 
429   /* put together the new symbolic matrix */
430   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
431   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
432 
433   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
434   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
435   c          = (Mat_SeqAIJ*)(C->data);
436   c->free_a  = PETSC_TRUE;
437   c->free_ij = PETSC_TRUE;
438   c->nonew   = 0;
439 
440   /* slower, less memory */
441   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
442 
443   /* set MatInfo */
444   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
445   if (afill < 1.0) afill = 1.0;
446   c->maxnz                     = ci[am];
447   c->nz                        = ci[am];
448   C->info.mallocs           = ndouble;
449   C->info.fill_ratio_given  = fill;
450   C->info.fill_ratio_needed = afill;
451 
452 #if defined(PETSC_USE_INFO)
453   if (ci[am]) {
454     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
455     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
456   } else {
457     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
458   }
459 #endif
460   PetscFunctionReturn(0);
461 }
462 
463 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C)
464 {
465   PetscErrorCode     ierr;
466   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
467   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
468   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
469   MatScalar          *ca;
470   PetscReal          afill;
471   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
472   PetscTable         ta;
473   PetscFreeSpaceList free_space=NULL,current_space=NULL;
474 
475   PetscFunctionBegin;
476   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
477   /*---------------------------------------------------------------------------------------------*/
478   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
479   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
480   ci[0] = 0;
481 
482   /* create and initialize a linked list */
483   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
484   MatRowMergeMax_SeqAIJ(b,bm,ta);
485   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
486   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
487   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
488 
489   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
490   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
491   current_space = free_space;
492 
493   /* Determine ci and cj */
494   for (i=0; i<am; i++) {
495     anzi = ai[i+1] - ai[i];
496     aj   = a->j + ai[i];
497     for (j=0; j<anzi; j++) {
498       brow = aj[j];
499       bnzj = bi[brow+1] - bi[brow];
500       bj   = b->j + bi[brow];
501       /* add non-zero cols of B into the sorted linked list lnk */
502       ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr);
503     }
504     cnzi = lnk[0];
505 
506     /* If free space is not available, make more free space */
507     /* Double the amount of total space in the list */
508     if (current_space->local_remaining<cnzi) {
509       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
510       ndouble++;
511     }
512 
513     /* Copy data into free space, then initialize lnk */
514     ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr);
515 
516     current_space->array           += cnzi;
517     current_space->local_used      += cnzi;
518     current_space->local_remaining -= cnzi;
519 
520     ci[i+1] = ci[i] + cnzi;
521   }
522 
523   /* Column indices are in the list of free space */
524   /* Allocate space for cj, initialize cj, and */
525   /* destroy list of free space and other temporary array(s) */
526   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
527   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
528   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
529 
530   /* Allocate space for ca */
531   /*-----------------------*/
532   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
533 
534   /* put together the new symbolic matrix */
535   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
536   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
537 
538   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
539   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
540   c          = (Mat_SeqAIJ*)(C->data);
541   c->free_a  = PETSC_TRUE;
542   c->free_ij = PETSC_TRUE;
543   c->nonew   = 0;
544 
545   /* slower, less memory */
546   C->ops->matmultnumeric    = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
547 
548   /* set MatInfo */
549   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
550   if (afill < 1.0) afill = 1.0;
551   c->maxnz                     = ci[am];
552   c->nz                        = ci[am];
553   C->info.mallocs           = ndouble;
554   C->info.fill_ratio_given  = fill;
555   C->info.fill_ratio_needed = afill;
556 
557 #if defined(PETSC_USE_INFO)
558   if (ci[am]) {
559     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
560     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
561   } else {
562     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
563   }
564 #endif
565   PetscFunctionReturn(0);
566 }
567 
568 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C)
569 {
570   PetscErrorCode     ierr;
571   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
572   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
573   PetscInt           *ci,*cj,*bb;
574   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
575   PetscReal          afill;
576   PetscInt           i,j,col,ndouble = 0;
577   PetscFreeSpaceList free_space=NULL,current_space=NULL;
578   PetscHeap          h;
579 
580   PetscFunctionBegin;
581   /* Get ci and cj - by merging sorted rows using a heap */
582   /*---------------------------------------------------------------------------------------------*/
583   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
584   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
585   ci[0] = 0;
586 
587   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
588   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
589   current_space = free_space;
590 
591   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
592   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
593 
594   /* Determine ci and cj */
595   for (i=0; i<am; i++) {
596     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
597     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
598     ci[i+1] = ci[i];
599     /* Populate the min heap */
600     for (j=0; j<anzi; j++) {
601       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
602       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
603         ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr);
604       }
605     }
606     /* Pick off the min element, adding it to free space */
607     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
608     while (j >= 0) {
609       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
610         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
611         ndouble++;
612       }
613       *(current_space->array++) = col;
614       current_space->local_used++;
615       current_space->local_remaining--;
616       ci[i+1]++;
617 
618       /* stash if anything else remains in this row of B */
619       if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);}
620       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
621         PetscInt j2,col2;
622         ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr);
623         if (col2 != col) break;
624         ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr);
625         if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);}
626       }
627       /* Put any stashed elements back into the min heap */
628       ierr = PetscHeapUnstash(h);CHKERRQ(ierr);
629       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
630     }
631   }
632   ierr = PetscFree(bb);CHKERRQ(ierr);
633   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
634 
635   /* Column indices are in the list of free space */
636   /* Allocate space for cj, initialize cj, and */
637   /* destroy list of free space and other temporary array(s) */
638   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
639   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
640 
641   /* put together the new symbolic matrix */
642   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
643   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
644 
645   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
646   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
647   c          = (Mat_SeqAIJ*)(C->data);
648   c->free_a  = PETSC_TRUE;
649   c->free_ij = PETSC_TRUE;
650   c->nonew   = 0;
651 
652   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
653 
654   /* set MatInfo */
655   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
656   if (afill < 1.0) afill = 1.0;
657   c->maxnz                     = ci[am];
658   c->nz                        = ci[am];
659   C->info.mallocs           = ndouble;
660   C->info.fill_ratio_given  = fill;
661   C->info.fill_ratio_needed = afill;
662 
663 #if defined(PETSC_USE_INFO)
664   if (ci[am]) {
665     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
666     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
667   } else {
668     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
669   }
670 #endif
671   PetscFunctionReturn(0);
672 }
673 
674 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C)
675 {
676   PetscErrorCode     ierr;
677   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
678   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
679   PetscInt           *ci,*cj,*bb;
680   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
681   PetscReal          afill;
682   PetscInt           i,j,col,ndouble = 0;
683   PetscFreeSpaceList free_space=NULL,current_space=NULL;
684   PetscHeap          h;
685   PetscBT            bt;
686 
687   PetscFunctionBegin;
688   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
689   /*---------------------------------------------------------------------------------------------*/
690   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
691   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
692   ci[0] = 0;
693 
694   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
695   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
696 
697   current_space = free_space;
698 
699   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
700   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
701   ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr);
702 
703   /* Determine ci and cj */
704   for (i=0; i<am; i++) {
705     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
706     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
707     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
708     ci[i+1] = ci[i];
709     /* Populate the min heap */
710     for (j=0; j<anzi; j++) {
711       PetscInt brow = acol[j];
712       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
713         PetscInt bcol = bj[bb[j]];
714         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
715           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
716           bb[j]++;
717           break;
718         }
719       }
720     }
721     /* Pick off the min element, adding it to free space */
722     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
723     while (j >= 0) {
724       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
725         fptr = NULL;                      /* need PetscBTMemzero */
726         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
727         ndouble++;
728       }
729       *(current_space->array++) = col;
730       current_space->local_used++;
731       current_space->local_remaining--;
732       ci[i+1]++;
733 
734       /* stash if anything else remains in this row of B */
735       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
736         PetscInt bcol = bj[bb[j]];
737         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
738           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
739           bb[j]++;
740           break;
741         }
742       }
743       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
744     }
745     if (fptr) {                 /* Clear the bits for this row */
746       for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);}
747     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
748       ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr);
749     }
750   }
751   ierr = PetscFree(bb);CHKERRQ(ierr);
752   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
753   ierr = PetscBTDestroy(&bt);CHKERRQ(ierr);
754 
755   /* Column indices are in the list of free space */
756   /* Allocate space for cj, initialize cj, and */
757   /* destroy list of free space and other temporary array(s) */
758   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
759   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
760 
761   /* put together the new symbolic matrix */
762   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
763   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
764 
765   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
766   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
767   c          = (Mat_SeqAIJ*)(C->data);
768   c->free_a  = PETSC_TRUE;
769   c->free_ij = PETSC_TRUE;
770   c->nonew   = 0;
771 
772   C->ops->matmultnumeric        = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
773 
774   /* set MatInfo */
775   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
776   if (afill < 1.0) afill = 1.0;
777   c->maxnz                     = ci[am];
778   c->nz                        = ci[am];
779   C->info.mallocs           = ndouble;
780   C->info.fill_ratio_given  = fill;
781   C->info.fill_ratio_needed = afill;
782 
783 #if defined(PETSC_USE_INFO)
784   if (ci[am]) {
785     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
786     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
787   } else {
788     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
789   }
790 #endif
791   PetscFunctionReturn(0);
792 }
793 
794 
795 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C)
796 {
797   PetscErrorCode     ierr;
798   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
799   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1;
800   PetscInt           *ci,*cj,*outputj,worki_L1[9],worki_L2[9];
801   PetscInt           c_maxmem,a_maxrownnz=0,a_rownnz;
802   const PetscInt     workcol[8]={0,1,2,3,4,5,6,7};
803   const PetscInt     am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
804   const PetscInt     *brow_ptr[8],*brow_end[8];
805   PetscInt           window[8];
806   PetscInt           window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows;
807   PetscInt           i,k,ndouble=0,L1_rowsleft,rowsleft;
808   PetscReal          afill;
809   PetscInt           *workj_L1,*workj_L2,*workj_L3;
810   PetscInt           L1_nnz,L2_nnz;
811 
812   /* Step 1: Get upper bound on memory required for allocation.
813              Because of the way virtual memory works,
814              only the memory pages that are actually needed will be physically allocated. */
815   PetscFunctionBegin;
816   ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
817   for (i=0; i<am; i++) {
818     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
819     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
820     a_rownnz = 0;
821     for (k=0; k<anzi; ++k) {
822       a_rownnz += bi[acol[k]+1] - bi[acol[k]];
823       if (a_rownnz > bn) {
824         a_rownnz = bn;
825         break;
826       }
827     }
828     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
829   }
830   /* temporary work areas for merging rows */
831   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L1);CHKERRQ(ierr);
832   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L2);CHKERRQ(ierr);
833   ierr = PetscMalloc1(a_maxrownnz,&workj_L3);CHKERRQ(ierr);
834 
835   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
836   c_maxmem = 8*(ai[am]+bi[bm]);
837   /* Step 2: Populate pattern for C */
838   ierr  = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr);
839 
840   ci_nnz       = 0;
841   ci[0]        = 0;
842   worki_L1[0]  = 0;
843   worki_L2[0]  = 0;
844   for (i=0; i<am; i++) {
845     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
846     const PetscInt *acol = aj + ai[i];      /* column indices of nonzero entries in this row */
847     rowsleft             = anzi;
848     inputcol_L1          = acol;
849     L2_nnz               = 0;
850     L2_nrows             = 1;  /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
851     worki_L2[1]          = 0;
852     outputi_nnz          = 0;
853 
854     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
855     while (ci_nnz+a_maxrownnz > c_maxmem) {
856       c_maxmem *= 2;
857       ndouble++;
858       ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr);
859     }
860 
861     while (rowsleft) {
862       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
863       L1_nrows    = 0;
864       L1_nnz      = 0;
865       inputcol    = inputcol_L1;
866       inputi      = bi;
867       inputj      = bj;
868 
869       /* The following macro is used to specialize for small rows in A.
870          This helps with compiler unrolling, improving performance substantially.
871           Input:  inputj   inputi  inputcol  bn
872           Output: outputj  outputi_nnz                       */
873        #define MatMatMultSymbolic_RowMergeMacro(ANNZ)                        \
874          window_min  = bn;                                                   \
875          outputi_nnz = 0;                                                    \
876          for (k=0; k<ANNZ; ++k) {                                            \
877            brow_ptr[k] = inputj + inputi[inputcol[k]];                       \
878            brow_end[k] = inputj + inputi[inputcol[k]+1];                     \
879            window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn;   \
880            window_min  = PetscMin(window[k], window_min);                    \
881          }                                                                   \
882          while (window_min < bn) {                                           \
883            outputj[outputi_nnz++] = window_min;                              \
884            /* advance front and compute new minimum */                       \
885            old_window_min = window_min;                                      \
886            window_min = bn;                                                  \
887            for (k=0; k<ANNZ; ++k) {                                          \
888              if (window[k] == old_window_min) {                              \
889                brow_ptr[k]++;                                                \
890                window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
891              }                                                               \
892              window_min = PetscMin(window[k], window_min);                   \
893            }                                                                 \
894          }
895 
896       /************** L E V E L  1 ***************/
897       /* Merge up to 8 rows of B to L1 work array*/
898       while (L1_rowsleft) {
899         outputi_nnz = 0;
900         if (anzi > 8)  outputj = workj_L1 + L1_nnz;     /* Level 1 rowmerge*/
901         else           outputj = cj + ci_nnz;           /* Merge directly to C */
902 
903         switch (L1_rowsleft) {
904         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
905                  brow_end[0] = inputj + inputi[inputcol[0]+1];
906                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
907                  inputcol    += L1_rowsleft;
908                  rowsleft    -= L1_rowsleft;
909                  L1_rowsleft  = 0;
910                  break;
911         case 2:  MatMatMultSymbolic_RowMergeMacro(2);
912                  inputcol    += L1_rowsleft;
913                  rowsleft    -= L1_rowsleft;
914                  L1_rowsleft  = 0;
915                  break;
916         case 3: MatMatMultSymbolic_RowMergeMacro(3);
917                  inputcol    += L1_rowsleft;
918                  rowsleft    -= L1_rowsleft;
919                  L1_rowsleft  = 0;
920                  break;
921         case 4:  MatMatMultSymbolic_RowMergeMacro(4);
922                  inputcol    += L1_rowsleft;
923                  rowsleft    -= L1_rowsleft;
924                  L1_rowsleft  = 0;
925                  break;
926         case 5:  MatMatMultSymbolic_RowMergeMacro(5);
927                  inputcol    += L1_rowsleft;
928                  rowsleft    -= L1_rowsleft;
929                  L1_rowsleft  = 0;
930                  break;
931         case 6:  MatMatMultSymbolic_RowMergeMacro(6);
932                  inputcol    += L1_rowsleft;
933                  rowsleft    -= L1_rowsleft;
934                  L1_rowsleft  = 0;
935                  break;
936         case 7:  MatMatMultSymbolic_RowMergeMacro(7);
937                  inputcol    += L1_rowsleft;
938                  rowsleft    -= L1_rowsleft;
939                  L1_rowsleft  = 0;
940                  break;
941         default: MatMatMultSymbolic_RowMergeMacro(8);
942                  inputcol    += 8;
943                  rowsleft    -= 8;
944                  L1_rowsleft -= 8;
945                  break;
946         }
947         inputcol_L1           = inputcol;
948         L1_nnz               += outputi_nnz;
949         worki_L1[++L1_nrows]  = L1_nnz;
950       }
951 
952       /********************** L E V E L  2 ************************/
953       /* Merge from L1 work array to either C or to L2 work array */
954       if (anzi > 8) {
955         inputi      = worki_L1;
956         inputj      = workj_L1;
957         inputcol    = workcol;
958         outputi_nnz = 0;
959 
960         if (anzi <= 64) outputj = cj + ci_nnz;        /* Merge from L1 work array to C */
961         else            outputj = workj_L2 + L2_nnz;  /* Merge from L1 work array to L2 work array */
962 
963         switch (L1_nrows) {
964         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
965                  brow_end[0] = inputj + inputi[inputcol[0]+1];
966                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
967                  break;
968         case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
969         case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
970         case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
971         case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
972         case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
973         case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
974         case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
975         default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
976         }
977         L2_nnz               += outputi_nnz;
978         worki_L2[++L2_nrows]  = L2_nnz;
979 
980         /************************ L E V E L  3 **********************/
981         /* Merge from L2 work array to either C or to L2 work array */
982         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
983           inputi      = worki_L2;
984           inputj      = workj_L2;
985           inputcol    = workcol;
986           outputi_nnz = 0;
987           if (rowsleft) outputj = workj_L3;
988           else          outputj = cj + ci_nnz;
989           switch (L2_nrows) {
990           case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
991                    brow_end[0] = inputj + inputi[inputcol[0]+1];
992                    for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
993                    break;
994           case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
995           case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
996           case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
997           case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
998           case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
999           case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1000           case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1001           default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1002           }
1003           L2_nrows    = 1;
1004           L2_nnz      = outputi_nnz;
1005           worki_L2[1] = outputi_nnz;
1006           /* Copy to workj_L2 */
1007           if (rowsleft) {
1008             for (k=0; k<outputi_nnz; ++k)  workj_L2[k] = outputj[k];
1009           }
1010         }
1011       }
1012     }  /* while (rowsleft) */
1013 #undef MatMatMultSymbolic_RowMergeMacro
1014 
1015     /* terminate current row */
1016     ci_nnz += outputi_nnz;
1017     ci[i+1] = ci_nnz;
1018   }
1019 
1020   /* Step 3: Create the new symbolic matrix */
1021   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
1022   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
1023 
1024   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1025   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1026   c          = (Mat_SeqAIJ*)(C->data);
1027   c->free_a  = PETSC_TRUE;
1028   c->free_ij = PETSC_TRUE;
1029   c->nonew   = 0;
1030 
1031   C->ops->matmultnumeric        = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1032 
1033   /* set MatInfo */
1034   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1035   if (afill < 1.0) afill = 1.0;
1036   c->maxnz                     = ci[am];
1037   c->nz                        = ci[am];
1038   C->info.mallocs           = ndouble;
1039   C->info.fill_ratio_given  = fill;
1040   C->info.fill_ratio_needed = afill;
1041 
1042 #if defined(PETSC_USE_INFO)
1043   if (ci[am]) {
1044     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
1045     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1046   } else {
1047     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
1048   }
1049 #endif
1050 
1051   /* Step 4: Free temporary work areas */
1052   ierr = PetscFree(workj_L1);CHKERRQ(ierr);
1053   ierr = PetscFree(workj_L2);CHKERRQ(ierr);
1054   ierr = PetscFree(workj_L3);CHKERRQ(ierr);
1055   PetscFunctionReturn(0);
1056 }
1057 
1058 /* concatenate unique entries and then sort */
1059 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C)
1060 {
1061   PetscErrorCode     ierr;
1062   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1063   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
1064   PetscInt           *ci,*cj;
1065   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1066   PetscReal          afill;
1067   PetscInt           i,j,ndouble = 0;
1068   PetscSegBuffer     seg,segrow;
1069   char               *seen;
1070 
1071   PetscFunctionBegin;
1072   ierr  = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
1073   ci[0] = 0;
1074 
1075   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1076   ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr);
1077   ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr);
1078   ierr = PetscCalloc1(bn,&seen);CHKERRQ(ierr);
1079 
1080   /* Determine ci and cj */
1081   for (i=0; i<am; i++) {
1082     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1083     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1084     PetscInt packlen = 0,*PETSC_RESTRICT crow;
1085     /* Pack segrow */
1086     for (j=0; j<anzi; j++) {
1087       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
1088       for (k=bjstart; k<bjend; k++) {
1089         PetscInt bcol = bj[k];
1090         if (!seen[bcol]) { /* new entry */
1091           PetscInt *PETSC_RESTRICT slot;
1092           ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr);
1093           *slot = bcol;
1094           seen[bcol] = 1;
1095           packlen++;
1096         }
1097       }
1098     }
1099     ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr);
1100     ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr);
1101     ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr);
1102     ci[i+1] = ci[i] + packlen;
1103     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
1104   }
1105   ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr);
1106   ierr = PetscFree(seen);CHKERRQ(ierr);
1107 
1108   /* Column indices are in the segmented buffer */
1109   ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr);
1110   ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr);
1111 
1112   /* put together the new symbolic matrix */
1113   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
1114   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
1115 
1116   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1117   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1118   c          = (Mat_SeqAIJ*)(C->data);
1119   c->free_a  = PETSC_TRUE;
1120   c->free_ij = PETSC_TRUE;
1121   c->nonew   = 0;
1122 
1123   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1124 
1125   /* set MatInfo */
1126   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1127   if (afill < 1.0) afill = 1.0;
1128   c->maxnz                     = ci[am];
1129   c->nz                        = ci[am];
1130   C->info.mallocs           = ndouble;
1131   C->info.fill_ratio_given  = fill;
1132   C->info.fill_ratio_needed = afill;
1133 
1134 #if defined(PETSC_USE_INFO)
1135   if (ci[am]) {
1136     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
1137     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1138   } else {
1139     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
1140   }
1141 #endif
1142   PetscFunctionReturn(0);
1143 }
1144 
1145 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A)
1146 {
1147   PetscErrorCode      ierr;
1148   Mat_SeqAIJ          *a=(Mat_SeqAIJ*)A->data;
1149   Mat_MatMatTransMult *abt=a->abt;
1150 
1151   PetscFunctionBegin;
1152   ierr = (abt->destroy)(A);CHKERRQ(ierr);
1153   ierr = MatTransposeColoringDestroy(&abt->matcoloring);CHKERRQ(ierr);
1154   ierr = MatDestroy(&abt->Bt_den);CHKERRQ(ierr);
1155   ierr = MatDestroy(&abt->ABt_den);CHKERRQ(ierr);
1156   ierr = PetscFree(abt);CHKERRQ(ierr);
1157   PetscFunctionReturn(0);
1158 }
1159 
1160 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1161 {
1162   PetscErrorCode      ierr;
1163   Mat                 Bt;
1164   PetscInt            *bti,*btj;
1165   Mat_MatMatTransMult *abt;
1166   Mat_SeqAIJ          *c;
1167   Mat_Product         *product = C->product;
1168   MatProductAlgorithm alg = product->alg;
1169 
1170   PetscFunctionBegin;
1171   /* create symbolic Bt */
1172   ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
1173   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr);
1174   ierr = MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
1175   ierr = MatSetType(Bt,((PetscObject)A)->type_name);CHKERRQ(ierr);
1176 
1177   /* get symbolic C=A*Bt */
1178   ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); /* set algorithm for C = A*Bt */
1179   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr);
1180   ierr = MatProductSetAlgorithm(C,alg);CHKERRQ(ierr); /* resume original algorithm for ABt product */
1181 
1182   /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
1183   ierr   = PetscNew(&abt);CHKERRQ(ierr);
1184   c      = (Mat_SeqAIJ*)C->data;
1185   c->abt = abt;
1186 
1187   abt->usecoloring = PETSC_FALSE;
1188   abt->destroy     = C->ops->destroy;
1189   C->ops->destroy  = MatDestroy_SeqAIJ_MatMatMultTrans;
1190   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
1191 
1192   abt->usecoloring = PETSC_FALSE;
1193   ierr = PetscStrcmp(product->alg,"color",&abt->usecoloring);CHKERRQ(ierr);
1194   if (abt->usecoloring) {
1195     /* Create MatTransposeColoring from symbolic C=A*B^T */
1196     MatTransposeColoring matcoloring;
1197     MatColoring          coloring;
1198     ISColoring           iscoloring;
1199     Mat                  Bt_dense,C_dense;
1200 
1201     /* inode causes memory problem */
1202     ierr = MatSetOption(C,MAT_USE_INODES,PETSC_FALSE);CHKERRQ(ierr);
1203 
1204     ierr = MatColoringCreate(C,&coloring);CHKERRQ(ierr);
1205     ierr = MatColoringSetDistance(coloring,2);CHKERRQ(ierr);
1206     ierr = MatColoringSetType(coloring,MATCOLORINGSL);CHKERRQ(ierr);
1207     ierr = MatColoringSetFromOptions(coloring);CHKERRQ(ierr);
1208     ierr = MatColoringApply(coloring,&iscoloring);CHKERRQ(ierr);
1209     ierr = MatColoringDestroy(&coloring);CHKERRQ(ierr);
1210     ierr = MatTransposeColoringCreate(C,iscoloring,&matcoloring);CHKERRQ(ierr);
1211 
1212     abt->matcoloring = matcoloring;
1213 
1214     ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr);
1215 
1216     /* Create Bt_dense and C_dense = A*Bt_dense */
1217     ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr);
1218     ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr);
1219     ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr);
1220     ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr);
1221 
1222     Bt_dense->assembled = PETSC_TRUE;
1223     abt->Bt_den         = Bt_dense;
1224 
1225     ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr);
1226     ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr);
1227     ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr);
1228     ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr);
1229 
1230     Bt_dense->assembled = PETSC_TRUE;
1231     abt->ABt_den  = C_dense;
1232 
1233 #if defined(PETSC_USE_INFO)
1234     {
1235       Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
1236       ierr = PetscInfo7(C,"Use coloring of C=A*B^T; B^T: %D %D, Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));CHKERRQ(ierr);
1237     }
1238 #endif
1239   }
1240   /* clean up */
1241   ierr = MatDestroy(&Bt);CHKERRQ(ierr);
1242   ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
1243   PetscFunctionReturn(0);
1244 }
1245 
1246 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1247 {
1248   PetscErrorCode      ierr;
1249   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1250   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
1251   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
1252   PetscLogDouble      flops=0.0;
1253   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
1254   Mat_MatMatTransMult *abt = c->abt;
1255 
1256   PetscFunctionBegin;
1257   /* clear old values in C */
1258   if (!c->a) {
1259     ierr      = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
1260     c->a      = ca;
1261     c->free_a = PETSC_TRUE;
1262   } else {
1263     ca =  c->a;
1264     ierr = PetscArrayzero(ca,ci[cm]+1);CHKERRQ(ierr);
1265   }
1266 
1267   if (abt->usecoloring) {
1268     MatTransposeColoring matcoloring = abt->matcoloring;
1269     Mat                  Bt_dense,C_dense = abt->ABt_den;
1270 
1271     /* Get Bt_dense by Apply MatTransposeColoring to B */
1272     Bt_dense = abt->Bt_den;
1273     ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr);
1274 
1275     /* C_dense = A*Bt_dense */
1276     ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr);
1277 
1278     /* Recover C from C_dense */
1279     ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr);
1280     PetscFunctionReturn(0);
1281   }
1282 
1283   for (i=0; i<cm; i++) {
1284     anzi = ai[i+1] - ai[i];
1285     acol = aj + ai[i];
1286     aval = aa + ai[i];
1287     cnzi = ci[i+1] - ci[i];
1288     ccol = cj + ci[i];
1289     cval = ca + ci[i];
1290     for (j=0; j<cnzi; j++) {
1291       brow = ccol[j];
1292       bnzj = bi[brow+1] - bi[brow];
1293       bcol = bj + bi[brow];
1294       bval = ba + bi[brow];
1295 
1296       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
1297       nexta = 0; nextb = 0;
1298       while (nexta<anzi && nextb<bnzj) {
1299         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
1300         if (nexta == anzi) break;
1301         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
1302         if (nextb == bnzj) break;
1303         if (acol[nexta] == bcol[nextb]) {
1304           cval[j] += aval[nexta]*bval[nextb];
1305           nexta++; nextb++;
1306           flops += 2;
1307         }
1308       }
1309     }
1310   }
1311   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1312   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1313   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1314   PetscFunctionReturn(0);
1315 }
1316 
1317 PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(Mat A)
1318 {
1319   PetscErrorCode      ierr;
1320   Mat_SeqAIJ          *a = (Mat_SeqAIJ*)A->data;
1321   Mat_MatTransMatMult *atb = a->atb;
1322 
1323   PetscFunctionBegin;
1324   if (atb) {
1325     ierr = MatDestroy(&atb->At);CHKERRQ(ierr);
1326     ierr = (*atb->destroy)(A);CHKERRQ(ierr);
1327   }
1328   ierr = PetscFree(atb);CHKERRQ(ierr);
1329   PetscFunctionReturn(0);
1330 }
1331 
1332 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1333 {
1334   PetscErrorCode      ierr;
1335   Mat                 At;
1336   PetscInt            *ati,*atj;
1337   Mat_Product         *product = C->product;
1338   MatProductAlgorithm alg;
1339   PetscBool           flg;
1340 
1341   PetscFunctionBegin;
1342   if (product) {
1343     alg = product->alg;
1344   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"!product, not supported yet");
1345 
1346   /* outerproduct */
1347   ierr = PetscStrcmp(alg,"outerproduct",&flg);CHKERRQ(ierr);
1348   if (flg) {
1349     /* create symbolic At */
1350     ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
1351     ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr);
1352     ierr = MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
1353     ierr = MatSetType(At,((PetscObject)A)->type_name);CHKERRQ(ierr);
1354 
1355     /* get symbolic C=At*B */
1356     product->alg = "sorted";
1357     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
1358 
1359     /* clean up */
1360     ierr = MatDestroy(&At);CHKERRQ(ierr);
1361     ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
1362 
1363     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
1364     PetscFunctionReturn(0);
1365   }
1366 
1367   /* matmatmult */
1368   ierr = PetscStrcmp(alg,"at*b",&flg);CHKERRQ(ierr);
1369   if (flg) {
1370     Mat_MatTransMatMult *atb;
1371     Mat_SeqAIJ          *c;
1372 
1373     ierr = PetscNew(&atb);CHKERRQ(ierr);
1374     ierr = MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);CHKERRQ(ierr);
1375     product->alg = "sorted";
1376     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
1377 
1378     c               = (Mat_SeqAIJ*)C->data;
1379     c->atb          = atb;
1380     atb->At         = At;
1381     atb->destroy    = C->ops->destroy;
1382     atb->updateAt   = PETSC_FALSE; /* because At is computed here */
1383     C->ops->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
1384 
1385     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
1386     PetscFunctionReturn(0);
1387   }
1388 
1389   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1390   PetscFunctionReturn(0);
1391 }
1392 
1393 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1394 {
1395   PetscErrorCode ierr;
1396   Mat_SeqAIJ     *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1397   PetscInt       am   =A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1398   PetscInt       cm   =C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1399   PetscLogDouble flops=0.0;
1400   MatScalar      *aa  =a->a,*ba,*ca,*caj;
1401 
1402   PetscFunctionBegin;
1403   if (!c->a) {
1404     ierr = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
1405 
1406     c->a      = ca;
1407     c->free_a = PETSC_TRUE;
1408   } else {
1409     ca   = c->a;
1410     ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
1411   }
1412 
1413   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1414   for (i=0; i<am; i++) {
1415     bj   = b->j + bi[i];
1416     ba   = b->a + bi[i];
1417     bnzi = bi[i+1] - bi[i];
1418     anzi = ai[i+1] - ai[i];
1419     for (j=0; j<anzi; j++) {
1420       nextb = 0;
1421       crow  = *aj++;
1422       cjj   = cj + ci[crow];
1423       caj   = ca + ci[crow];
1424       /* perform sparse axpy operation.  Note cjj includes bj. */
1425       for (k=0; nextb<bnzi; k++) {
1426         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
1427           caj[k] += (*aa)*(*(ba+nextb));
1428           nextb++;
1429         }
1430       }
1431       flops += 2*bnzi;
1432       aa++;
1433     }
1434   }
1435 
1436   /* Assemble the final matrix and clean up */
1437   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1438   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1439   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1440   PetscFunctionReturn(0);
1441 }
1442 
1443 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
1444 {
1445   PetscErrorCode ierr;
1446 
1447   PetscFunctionBegin;
1448   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
1449 
1450   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1451   PetscFunctionReturn(0);
1452 }
1453 
1454 PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1455 {
1456   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
1457   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
1458   PetscErrorCode    ierr;
1459   PetscScalar       *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4,aatmp;
1460   const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av;
1461   const PetscInt    *aj;
1462   PetscInt          cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n;
1463   PetscInt          am4=4*am,bm4=4*bm,col,i,j,n,ajtmp;
1464 
1465   PetscFunctionBegin;
1466   if (!cm || !cn) PetscFunctionReturn(0);
1467   ierr = MatSeqAIJGetArrayRead(A,&av);CHKERRQ(ierr);
1468   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1469   ierr = MatDenseGetArrayRead(B,&b);CHKERRQ(ierr);
1470   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1471   c1 = c; c2 = c1 + am; c3 = c2 + am; c4 = c3 + am;
1472   for (col=0; col<cn-4; col += 4) {  /* over columns of C */
1473     for (i=0; i<am; i++) {        /* over rows of C in those columns */
1474       r1 = r2 = r3 = r4 = 0.0;
1475       n  = a->i[i+1] - a->i[i];
1476       aj = a->j + a->i[i];
1477       aa = av + a->i[i];
1478       for (j=0; j<n; j++) {
1479         aatmp = aa[j]; ajtmp = aj[j];
1480         r1 += aatmp*b1[ajtmp];
1481         r2 += aatmp*b2[ajtmp];
1482         r3 += aatmp*b3[ajtmp];
1483         r4 += aatmp*b4[ajtmp];
1484       }
1485       c1[i] += r1;
1486       c2[i] += r2;
1487       c3[i] += r3;
1488       c4[i] += r4;
1489     }
1490     b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1491     c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1492   }
1493   for (; col<cn; col++) {   /* over extra columns of C */
1494     for (i=0; i<am; i++) {  /* over rows of C in those columns */
1495       r1 = 0.0;
1496       n  = a->i[i+1] - a->i[i];
1497       aj = a->j + a->i[i];
1498       aa = av + a->i[i];
1499       for (j=0; j<n; j++) {
1500         r1 += aa[j]*b1[aj[j]];
1501       }
1502       c1[i] += r1;
1503     }
1504     b1 += bm;
1505     c1 += am;
1506   }
1507   ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr);
1508   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1509   ierr = MatDenseRestoreArrayRead(B,&b);CHKERRQ(ierr);
1510   ierr = MatSeqAIJRestoreArrayRead(A,&av);CHKERRQ(ierr);
1511   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1512   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1513   PetscFunctionReturn(0);
1514 }
1515 
1516 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1517 {
1518   PetscErrorCode ierr;
1519 
1520   PetscFunctionBegin;
1521   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
1522   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1523   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1524 
1525   ierr = MatZeroEntries(C);CHKERRQ(ierr);
1526   ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C);CHKERRQ(ierr);
1527   PetscFunctionReturn(0);
1528 }
1529 
1530 /* ------------------------------------------------------- */
1531 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1532 {
1533   PetscFunctionBegin;
1534   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
1535   C->ops->productsymbolic = MatProductSymbolic_AB;
1536   /* dense mat may not call MatProductSymbolic(), thus set C->ops->productnumeric here */
1537   C->ops->productnumeric  = MatProductNumeric_AB;
1538   PetscFunctionReturn(0);
1539 }
1540 
1541 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1542 {
1543   PetscFunctionBegin;
1544   C->ops->transposematmultsymbolic = MatTransposeMatMultSymbolic_SeqAIJ_SeqDense;
1545   C->ops->productsymbolic          = MatProductSymbolic_AtB;
1546   C->ops->productnumeric           = MatProductNumeric_AtB;
1547   PetscFunctionReturn(0);
1548 }
1549 
1550 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1551 {
1552   PetscErrorCode ierr;
1553   Mat_Product    *product = C->product;
1554 
1555   PetscFunctionBegin;
1556   switch (product->type) {
1557   case MATPRODUCT_AB:
1558     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C);CHKERRQ(ierr);
1559     break;
1560   case MATPRODUCT_AtB:
1561     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C);CHKERRQ(ierr);
1562     break;
1563   case MATPRODUCT_PtAP:
1564     ierr = MatProductSetFromOptions_SeqDense(C);CHKERRQ(ierr);
1565     break;
1566   default:
1567     /* Use MatProduct_Basic() if there is no specific implementation */
1568     C->ops->productsymbolic = MatProductSymbolic_Basic;
1569   }
1570   PetscFunctionReturn(0);
1571 }
1572 /* ------------------------------------------------------- */
1573 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1574 {
1575   PetscErrorCode ierr;
1576   Mat_Product    *product = C->product;
1577   Mat            A = product->A;
1578   PetscBool      baij;
1579 
1580   PetscFunctionBegin;
1581   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij);CHKERRQ(ierr);
1582   if (!baij) { /* A is seqsbaij */
1583     PetscBool sbaij;
1584     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij);CHKERRQ(ierr);
1585     if (!sbaij) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format");
1586 
1587     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
1588   } else { /* A is seqbaij */
1589     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
1590   }
1591 
1592   C->ops->productsymbolic = MatProductSymbolic_AB;
1593   C->ops->productnumeric  = MatProductNumeric_AB;
1594   PetscFunctionReturn(0);
1595 }
1596 
1597 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1598 {
1599   PetscErrorCode ierr;
1600   Mat_Product    *product = C->product;
1601 
1602   PetscFunctionBegin;
1603   if (product->type == MATPRODUCT_AB) {
1604     ierr = MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C);CHKERRQ(ierr);
1605   } else SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_SUP,"MatProduct type %s is not supported for SeqXBAIJ and SeqDense matrices",MatProductTypes[product->type]);
1606   PetscFunctionReturn(0);
1607 }
1608 /* ------------------------------------------------------- */
1609 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1610 {
1611   PetscFunctionBegin;
1612   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
1613   C->ops->productsymbolic = MatProductSymbolic_AB;
1614   C->ops->productnumeric  = MatProductNumeric_AB;
1615   PetscFunctionReturn(0);
1616 }
1617 
1618 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1619 {
1620   PetscErrorCode ierr;
1621   Mat_Product    *product = C->product;
1622 
1623   PetscFunctionBegin;
1624   if (product->type == MATPRODUCT_AB) {
1625     ierr = MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C);CHKERRQ(ierr);
1626   } else SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_SUP,"MatProduct type %s is not supported for SeqDense and SeqAIJ matrices",MatProductTypes[product->type]);
1627   PetscFunctionReturn(0);
1628 }
1629 /* ------------------------------------------------------- */
1630 
1631 PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1632 {
1633   PetscErrorCode ierr;
1634   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
1635   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
1636   PetscInt       *bi      = b->i,*bj=b->j;
1637   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1638   MatScalar      *btval,*btval_den,*ba=b->a;
1639   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1640 
1641   PetscFunctionBegin;
1642   btval_den=btdense->v;
1643   ierr     = PetscArrayzero(btval_den,m*n);CHKERRQ(ierr);
1644   for (k=0; k<ncolors; k++) {
1645     ncolumns = coloring->ncolumns[k];
1646     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1647       col   = *(columns + colorforcol[k] + l);
1648       btcol = bj + bi[col];
1649       btval = ba + bi[col];
1650       anz   = bi[col+1] - bi[col];
1651       for (j=0; j<anz; j++) {
1652         brow            = btcol[j];
1653         btval_den[brow] = btval[j];
1654       }
1655     }
1656     btval_den += m;
1657   }
1658   PetscFunctionReturn(0);
1659 }
1660 
1661 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1662 {
1663   PetscErrorCode    ierr;
1664   Mat_SeqAIJ        *csp = (Mat_SeqAIJ*)Csp->data;
1665   const PetscScalar *ca_den,*ca_den_ptr;
1666   PetscScalar       *ca=csp->a;
1667   PetscInt          k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1668   PetscInt          brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1669   PetscInt          nrows,*row,*idx;
1670   PetscInt          *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
1671 
1672   PetscFunctionBegin;
1673   ierr   = MatDenseGetArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1674 
1675   if (brows > 0) {
1676     PetscInt *lstart,row_end,row_start;
1677     lstart = matcoloring->lstart;
1678     ierr = PetscArrayzero(lstart,ncolors);CHKERRQ(ierr);
1679 
1680     row_end = brows;
1681     if (row_end > m) row_end = m;
1682     for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1683       ca_den_ptr = ca_den;
1684       for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1685         nrows = matcoloring->nrows[k];
1686         row   = rows  + colorforrow[k];
1687         idx   = den2sp + colorforrow[k];
1688         for (l=lstart[k]; l<nrows; l++) {
1689           if (row[l] >= row_end) {
1690             lstart[k] = l;
1691             break;
1692           } else {
1693             ca[idx[l]] = ca_den_ptr[row[l]];
1694           }
1695         }
1696         ca_den_ptr += m;
1697       }
1698       row_end += brows;
1699       if (row_end > m) row_end = m;
1700     }
1701   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1702     ca_den_ptr = ca_den;
1703     for (k=0; k<ncolors; k++) {
1704       nrows = matcoloring->nrows[k];
1705       row   = rows  + colorforrow[k];
1706       idx   = den2sp + colorforrow[k];
1707       for (l=0; l<nrows; l++) {
1708         ca[idx[l]] = ca_den_ptr[row[l]];
1709       }
1710       ca_den_ptr += m;
1711     }
1712   }
1713 
1714   ierr = MatDenseRestoreArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1715 #if defined(PETSC_USE_INFO)
1716   if (matcoloring->brows > 0) {
1717     ierr = PetscInfo1(Csp,"Loop over %D row blocks for den2sp\n",brows);CHKERRQ(ierr);
1718   } else {
1719     ierr = PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");CHKERRQ(ierr);
1720   }
1721 #endif
1722   PetscFunctionReturn(0);
1723 }
1724 
1725 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1726 {
1727   PetscErrorCode ierr;
1728   PetscInt       i,n,nrows,Nbs,j,k,m,ncols,col,cm;
1729   const PetscInt *is,*ci,*cj,*row_idx;
1730   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1731   IS             *isa;
1732   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1733   PetscInt       *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1734   PetscInt       *colorforcol,*columns,*columns_i,brows;
1735   PetscBool      flg;
1736 
1737   PetscFunctionBegin;
1738   ierr = ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa);CHKERRQ(ierr);
1739 
1740   /* bs >1 is not being tested yet! */
1741   Nbs       = mat->cmap->N/bs;
1742   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1743   c->N      = Nbs;
1744   c->m      = c->M;
1745   c->rstart = 0;
1746   c->brows  = 100;
1747 
1748   c->ncolors = nis;
1749   ierr = PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);CHKERRQ(ierr);
1750   ierr = PetscMalloc1(csp->nz+1,&rows);CHKERRQ(ierr);
1751   ierr = PetscMalloc1(csp->nz+1,&den2sp);CHKERRQ(ierr);
1752 
1753   brows = c->brows;
1754   ierr = PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);CHKERRQ(ierr);
1755   if (flg) c->brows = brows;
1756   if (brows > 0) {
1757     ierr = PetscMalloc1(nis+1,&c->lstart);CHKERRQ(ierr);
1758   }
1759 
1760   colorforrow[0] = 0;
1761   rows_i         = rows;
1762   den2sp_i       = den2sp;
1763 
1764   ierr = PetscMalloc1(nis+1,&colorforcol);CHKERRQ(ierr);
1765   ierr = PetscMalloc1(Nbs+1,&columns);CHKERRQ(ierr);
1766 
1767   colorforcol[0] = 0;
1768   columns_i      = columns;
1769 
1770   /* get column-wise storage of mat */
1771   ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1772 
1773   cm   = c->m;
1774   ierr = PetscMalloc1(cm+1,&rowhit);CHKERRQ(ierr);
1775   ierr = PetscMalloc1(cm+1,&idxhit);CHKERRQ(ierr);
1776   for (i=0; i<nis; i++) { /* loop over color */
1777     ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr);
1778     ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr);
1779 
1780     c->ncolumns[i] = n;
1781     if (n) {
1782       ierr = PetscArraycpy(columns_i,is,n);CHKERRQ(ierr);
1783     }
1784     colorforcol[i+1] = colorforcol[i] + n;
1785     columns_i       += n;
1786 
1787     /* fast, crude version requires O(N*N) work */
1788     ierr = PetscArrayzero(rowhit,cm);CHKERRQ(ierr);
1789 
1790     for (j=0; j<n; j++) { /* loop over columns*/
1791       col     = is[j];
1792       row_idx = cj + ci[col];
1793       m       = ci[col+1] - ci[col];
1794       for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1795         idxhit[*row_idx]   = spidx[ci[col] + k];
1796         rowhit[*row_idx++] = col + 1;
1797       }
1798     }
1799     /* count the number of hits */
1800     nrows = 0;
1801     for (j=0; j<cm; j++) {
1802       if (rowhit[j]) nrows++;
1803     }
1804     c->nrows[i]      = nrows;
1805     colorforrow[i+1] = colorforrow[i] + nrows;
1806 
1807     nrows = 0;
1808     for (j=0; j<cm; j++) { /* loop over rows */
1809       if (rowhit[j]) {
1810         rows_i[nrows]   = j;
1811         den2sp_i[nrows] = idxhit[j];
1812         nrows++;
1813       }
1814     }
1815     den2sp_i += nrows;
1816 
1817     ierr    = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr);
1818     rows_i += nrows;
1819   }
1820   ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1821   ierr = PetscFree(rowhit);CHKERRQ(ierr);
1822   ierr = ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa);CHKERRQ(ierr);
1823   if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1824 
1825   c->colorforrow = colorforrow;
1826   c->rows        = rows;
1827   c->den2sp      = den2sp;
1828   c->colorforcol = colorforcol;
1829   c->columns     = columns;
1830 
1831   ierr = PetscFree(idxhit);CHKERRQ(ierr);
1832   PetscFunctionReturn(0);
1833 }
1834 
1835 /* --------------------------------------------------------------- */
1836 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1837 {
1838   PetscErrorCode ierr;
1839   Mat_Product    *product = C->product;
1840   Mat            A=product->A,B=product->B;
1841 
1842   PetscFunctionBegin;
1843   if (C->ops->mattransposemultnumeric) {
1844     /* Alg: "outerproduct" */
1845     ierr = (C->ops->mattransposemultnumeric)(A,B,C);CHKERRQ(ierr);
1846   } else {
1847     /* Alg: "matmatmult" -- C = At*B */
1848     Mat_SeqAIJ          *c = (Mat_SeqAIJ*)C->data;
1849     Mat_MatTransMatMult *atb = c->atb;
1850     Mat                 At = atb->At;
1851 
1852     if (atb->updateAt) { /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ() */
1853       ierr = MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);CHKERRQ(ierr);
1854     }
1855     ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ(At,B,C);CHKERRQ(ierr);
1856     atb->updateAt = PETSC_TRUE;
1857   }
1858   PetscFunctionReturn(0);
1859 }
1860 
1861 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1862 {
1863   PetscErrorCode ierr;
1864   Mat_Product    *product = C->product;
1865   Mat            A=product->A,B=product->B;
1866   PetscReal      fill=product->fill;
1867 
1868   PetscFunctionBegin;
1869   ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
1870 
1871   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
1872   PetscFunctionReturn(0);
1873 }
1874 
1875 /* --------------------------------------------------------------- */
1876 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1877 {
1878   PetscErrorCode ierr;
1879   Mat_Product    *product = C->product;
1880   PetscInt       alg = 0; /* default algorithm */
1881   PetscBool      flg = PETSC_FALSE;
1882 #if !defined(PETSC_HAVE_HYPRE)
1883   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
1884   PetscInt       nalg = 7;
1885 #else
1886   const char     *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"};
1887   PetscInt       nalg = 8;
1888 #endif
1889 
1890   PetscFunctionBegin;
1891   /* Set default algorithm */
1892   ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr);
1893   if (flg) {
1894     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1895   }
1896 
1897   /* Get runtime option */
1898   if (product->api_user) {
1899     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
1900     ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
1901     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1902   } else {
1903     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
1904     ierr = PetscOptionsEList("-matproduct_ab_via","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
1905     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1906   }
1907   if (flg) {
1908     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1909   }
1910 
1911   C->ops->productsymbolic = MatProductSymbolic_AB;
1912   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
1913   PetscFunctionReturn(0);
1914 }
1915 
1916 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
1917 {
1918   PetscErrorCode ierr;
1919   Mat_Product    *product = C->product;
1920   PetscInt       alg = 0; /* default algorithm */
1921   PetscBool      flg = PETSC_FALSE;
1922   const char     *algTypes[2] = {"at*b","outerproduct"};
1923   PetscInt       nalg = 2;
1924 
1925   PetscFunctionBegin;
1926   /* Set default algorithm */
1927   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
1928   if (flg) {
1929     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1930   }
1931 
1932   /* Get runtime option */
1933   if (product->api_user) {
1934     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
1935     ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
1936     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1937   } else {
1938     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
1939     ierr = PetscOptionsEList("-matproduct_atb_via","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
1940     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1941   }
1942   if (flg) {
1943     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1944   }
1945 
1946   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
1947   PetscFunctionReturn(0);
1948 }
1949 
1950 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
1951 {
1952   PetscErrorCode ierr;
1953   Mat_Product    *product = C->product;
1954   PetscInt       alg = 0; /* default algorithm */
1955   PetscBool      flg = PETSC_FALSE;
1956   const char     *algTypes[2] = {"default","color"};
1957   PetscInt       nalg = 2;
1958 
1959   PetscFunctionBegin;
1960   /* Set default algorithm */
1961   ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr);
1962   if (!flg) {
1963     alg = 1;
1964     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1965   }
1966 
1967   /* Get runtime option */
1968   if (product->api_user) {
1969     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");CHKERRQ(ierr);
1970     ierr = PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
1971     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1972   } else {
1973     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");CHKERRQ(ierr);
1974     ierr = PetscOptionsEList("-matproduct_abt_via","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
1975     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1976   }
1977   if (flg) {
1978     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1979   }
1980 
1981   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
1982   C->ops->productsymbolic          = MatProductSymbolic_ABt;
1983   PetscFunctionReturn(0);
1984 }
1985 
1986 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
1987 {
1988   PetscErrorCode ierr;
1989   Mat_Product    *product = C->product;
1990   PetscBool      flg = PETSC_FALSE;
1991   PetscInt       alg = 0; /* default algorithm -- alg=1 should be default!!! */
1992 #if !defined(PETSC_HAVE_HYPRE)
1993   const char      *algTypes[2] = {"scalable","rap"};
1994   PetscInt        nalg = 2;
1995 #else
1996   const char      *algTypes[3] = {"scalable","rap","hypre"};
1997   PetscInt        nalg = 3;
1998 #endif
1999 
2000   PetscFunctionBegin;
2001   /* Set default algorithm */
2002   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
2003   if (flg) {
2004     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2005   }
2006 
2007   /* Get runtime option */
2008   if (product->api_user) {
2009     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
2010     ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
2011     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2012   } else {
2013     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
2014     ierr = PetscOptionsEList("-matproduct_ptap_via","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
2015     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2016   }
2017   if (flg) {
2018     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2019   }
2020 
2021   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
2022   PetscFunctionReturn(0);
2023 }
2024 
2025 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2026 {
2027   PetscErrorCode ierr;
2028   Mat_Product    *product = C->product;
2029   PetscBool      flg = PETSC_FALSE;
2030   PetscInt       alg = 0; /* default algorithm */
2031   const char     *algTypes[3] = {"r*a*rt","r*art","coloring_rart"};
2032   PetscInt        nalg = 3;
2033 
2034   PetscFunctionBegin;
2035   /* Set default algorithm */
2036   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
2037   if (flg) {
2038     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2039   }
2040 
2041   /* Get runtime option */
2042   if (product->api_user) {
2043     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
2044     ierr = PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
2045     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2046   } else {
2047     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
2048     ierr = PetscOptionsEList("-matproduct_rart_via","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
2049     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2050   }
2051   if (flg) {
2052     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2053   }
2054 
2055   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
2056   PetscFunctionReturn(0);
2057 }
2058 
2059 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
2060 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2061 {
2062   PetscErrorCode ierr;
2063   Mat_Product    *product = C->product;
2064   PetscInt       alg = 0; /* default algorithm */
2065   PetscBool      flg = PETSC_FALSE;
2066   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
2067   PetscInt       nalg = 7;
2068 
2069   PetscFunctionBegin;
2070   /* Set default algorithm */
2071   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
2072   if (flg) {
2073     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2074   }
2075 
2076   /* Get runtime option */
2077   if (product->api_user) {
2078     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
2079     ierr = PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
2080     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2081   } else {
2082     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
2083     ierr = PetscOptionsEList("-matproduct_abc_via","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
2084     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2085   }
2086   if (flg) {
2087     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2088   }
2089 
2090   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
2091   C->ops->productsymbolic    = MatProductSymbolic_ABC;
2092   PetscFunctionReturn(0);
2093 }
2094 
2095 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2096 {
2097   PetscErrorCode ierr;
2098   Mat_Product    *product = C->product;
2099 
2100   PetscFunctionBegin;
2101   switch (product->type) {
2102   case MATPRODUCT_AB:
2103     ierr = MatProductSetFromOptions_SeqAIJ_AB(C);CHKERRQ(ierr);
2104     break;
2105   case MATPRODUCT_AtB:
2106     ierr = MatProductSetFromOptions_SeqAIJ_AtB(C);CHKERRQ(ierr);
2107     break;
2108   case MATPRODUCT_ABt:
2109     ierr = MatProductSetFromOptions_SeqAIJ_ABt(C);CHKERRQ(ierr);
2110     break;
2111   case MATPRODUCT_PtAP:
2112     ierr = MatProductSetFromOptions_SeqAIJ_PtAP(C);CHKERRQ(ierr);
2113     break;
2114   case MATPRODUCT_RARt:
2115     ierr = MatProductSetFromOptions_SeqAIJ_RARt(C);CHKERRQ(ierr);
2116     break;
2117   case MATPRODUCT_ABC:
2118     ierr = MatProductSetFromOptions_SeqAIJ_ABC(C);CHKERRQ(ierr);
2119     break;
2120   default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported for SeqAIJ and SeqAIJ matrices",MatProductTypes[product->type]);
2121   }
2122   PetscFunctionReturn(0);
2123 }
2124