xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 3d392a07bc8a9be20cde8f5732e8bb1887055e16)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4           C = A * B
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <petscbt.h>
10 #include <petsc/private/isimpl.h>
11 #include <../src/mat/impls/dense/seq/dense.h>
12 
13 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
14 {
15   PetscErrorCode ierr;
16 
17   PetscFunctionBegin;
18   if (C->ops->matmultnumeric) {
19     ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr);
20   } else {
21     ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C);CHKERRQ(ierr);
22   }
23   PetscFunctionReturn(0);
24 }
25 
26 /* Modified from MatCreateSeqAIJWithArrays() */
27 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat)
28 {
29   PetscErrorCode ierr;
30   PetscInt       ii;
31   Mat_SeqAIJ     *aij;
32   PetscBool      isseqaij;
33 
34   PetscFunctionBegin;
35   if (m > 0 && i[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0");
36   ierr = MatSetSizes(mat,m,n,m,n);CHKERRQ(ierr);
37 
38   if (!mtype) {
39     ierr = PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
40     if (!isseqaij) { ierr = MatSetType(mat,MATSEQAIJ);CHKERRQ(ierr); }
41   } else {
42     ierr = MatSetType(mat,mtype);CHKERRQ(ierr);
43   }
44   ierr = MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,0);CHKERRQ(ierr);
45   aij  = (Mat_SeqAIJ*)(mat)->data;
46   ierr = PetscMalloc1(m,&aij->imax);CHKERRQ(ierr);
47   ierr = PetscMalloc1(m,&aij->ilen);CHKERRQ(ierr);
48 
49   aij->i            = i;
50   aij->j            = j;
51   aij->a            = a;
52   aij->singlemalloc = PETSC_FALSE;
53   aij->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
54   aij->free_a       = PETSC_FALSE;
55   aij->free_ij      = PETSC_FALSE;
56 
57   for (ii=0; ii<m; ii++) {
58     aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii];
59   }
60 
61   PetscFunctionReturn(0);
62 }
63 
64 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
65 {
66   PetscErrorCode      ierr;
67   Mat_Product         *product = C->product;
68   MatProductAlgorithm alg;
69   PetscBool           flg;
70 
71   PetscFunctionBegin;
72   if (product) {
73     alg = product->alg;
74   } else {
75     alg = "sorted";
76   }
77   /* sorted */
78   ierr = PetscStrcmp(alg,"sorted",&flg);CHKERRQ(ierr);
79   if (flg) {
80     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C);CHKERRQ(ierr);
81     PetscFunctionReturn(0);
82   }
83 
84   /* scalable */
85   ierr = PetscStrcmp(alg,"scalable",&flg);CHKERRQ(ierr);
86   if (flg) {
87     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr);
88     PetscFunctionReturn(0);
89   }
90 
91   /* scalable_fast */
92   ierr = PetscStrcmp(alg,"scalable_fast",&flg);CHKERRQ(ierr);
93   if (flg) {
94     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr);
95     PetscFunctionReturn(0);
96   }
97 
98   /* heap */
99   ierr = PetscStrcmp(alg,"heap",&flg);CHKERRQ(ierr);
100   if (flg) {
101     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr);
102     PetscFunctionReturn(0);
103   }
104 
105   /* btheap */
106   ierr = PetscStrcmp(alg,"btheap",&flg);CHKERRQ(ierr);
107   if (flg) {
108     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr);
109     PetscFunctionReturn(0);
110   }
111 
112   /* llcondensed */
113   ierr = PetscStrcmp(alg,"llcondensed",&flg);CHKERRQ(ierr);
114   if (flg) {
115     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr);
116     PetscFunctionReturn(0);
117   }
118 
119   /* rowmerge */
120   ierr = PetscStrcmp(alg,"rowmerge",&flg);CHKERRQ(ierr);
121   if (flg) {
122     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);CHKERRQ(ierr);
123     PetscFunctionReturn(0);
124   }
125 
126 #if defined(PETSC_HAVE_HYPRE)
127   ierr = PetscStrcmp(alg,"hypre",&flg);CHKERRQ(ierr);
128   if (flg) {
129     ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr);
130     PetscFunctionReturn(0);
131   }
132 #endif
133 
134   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
135   PetscFunctionReturn(0);
136 }
137 
138 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C)
139 {
140   PetscErrorCode     ierr;
141   Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
142   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
143   PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
144   PetscReal          afill;
145   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
146   PetscTable         ta;
147   PetscBT            lnkbt;
148   PetscFreeSpaceList free_space=NULL,current_space=NULL;
149 
150   PetscFunctionBegin;
151   /* Get ci and cj */
152   /*---------------*/
153   /* Allocate ci array, arrays for fill computation and */
154   /* free space for accumulating nonzero column info */
155   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
156   ci[0] = 0;
157 
158   /* create and initialize a linked list */
159   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
160   MatRowMergeMax_SeqAIJ(b,bm,ta);
161   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
162   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
163 
164   ierr = PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);CHKERRQ(ierr);
165 
166   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
167   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
168 
169   current_space = free_space;
170 
171   /* Determine ci and cj */
172   for (i=0; i<am; i++) {
173     anzi = ai[i+1] - ai[i];
174     aj   = a->j + ai[i];
175     for (j=0; j<anzi; j++) {
176       brow = aj[j];
177       bnzj = bi[brow+1] - bi[brow];
178       bj   = b->j + bi[brow];
179       /* add non-zero cols of B into the sorted linked list lnk */
180       ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr);
181     }
182     cnzi = lnk[0];
183 
184     /* If free space is not available, make more free space */
185     /* Double the amount of total space in the list */
186     if (current_space->local_remaining<cnzi) {
187       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
188       ndouble++;
189     }
190 
191     /* Copy data into free space, then initialize lnk */
192     ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
193 
194     current_space->array           += cnzi;
195     current_space->local_used      += cnzi;
196     current_space->local_remaining -= cnzi;
197 
198     ci[i+1] = ci[i] + cnzi;
199   }
200 
201   /* Column indices are in the list of free space */
202   /* Allocate space for cj, initialize cj, and */
203   /* destroy list of free space and other temporary array(s) */
204   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
205   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
206   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
207 
208   /* put together the new symbolic matrix */
209   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
210   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
211 
212   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
213   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
214   c                         = (Mat_SeqAIJ*)(C->data);
215   c->free_a                 = PETSC_FALSE;
216   c->free_ij                = PETSC_TRUE;
217   c->nonew                  = 0;
218 
219   /* fast, needs non-scalable O(bn) array 'abdense' */
220   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
221 
222   /* set MatInfo */
223   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
224   if (afill < 1.0) afill = 1.0;
225   c->maxnz                     = ci[am];
226   c->nz                        = ci[am];
227   C->info.mallocs           = ndouble;
228   C->info.fill_ratio_given  = fill;
229   C->info.fill_ratio_needed = afill;
230 
231 #if defined(PETSC_USE_INFO)
232   if (ci[am]) {
233     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
234     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
235   } else {
236     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
237   }
238 #endif
239   PetscFunctionReturn(0);
240 }
241 
242 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C)
243 {
244   PetscErrorCode ierr;
245   PetscLogDouble flops=0.0;
246   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
247   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
248   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
249   PetscInt       *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
250   PetscInt       am   =A->rmap->n,cm=C->rmap->n;
251   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
252   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca,valtmp;
253   PetscScalar    *ab_dense;
254 
255   PetscFunctionBegin;
256   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
257     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
258     c->a      = ca;
259     c->free_a = PETSC_TRUE;
260   } else {
261     ca        = c->a;
262   }
263   if (!c->matmult_abdense) {
264     ierr = PetscCalloc1(B->cmap->N,&ab_dense);CHKERRQ(ierr);
265     c->matmult_abdense = ab_dense;
266   } else {
267     ab_dense = c->matmult_abdense;
268   }
269 
270   /* clean old values in C */
271   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
272   /* Traverse A row-wise. */
273   /* Build the ith row in C by summing over nonzero columns in A, */
274   /* the rows of B corresponding to nonzeros of A. */
275   for (i=0; i<am; i++) {
276     anzi = ai[i+1] - ai[i];
277     for (j=0; j<anzi; j++) {
278       brow = aj[j];
279       bnzi = bi[brow+1] - bi[brow];
280       bjj  = bj + bi[brow];
281       baj  = ba + bi[brow];
282       /* perform dense axpy */
283       valtmp = aa[j];
284       for (k=0; k<bnzi; k++) {
285         ab_dense[bjj[k]] += valtmp*baj[k];
286       }
287       flops += 2*bnzi;
288     }
289     aj += anzi; aa += anzi;
290 
291     cnzi = ci[i+1] - ci[i];
292     for (k=0; k<cnzi; k++) {
293       ca[k]          += ab_dense[cj[k]];
294       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
295     }
296     flops += cnzi;
297     cj    += cnzi; ca += cnzi;
298   }
299   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
300   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
301   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
302   PetscFunctionReturn(0);
303 }
304 
305 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
306 {
307   PetscErrorCode ierr;
308   PetscLogDouble flops=0.0;
309   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
310   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
311   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
312   PetscInt       *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
313   PetscInt       am   = A->rmap->N,cm=C->rmap->N;
314   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
315   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
316   PetscInt       nextb;
317 
318   PetscFunctionBegin;
319   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
320     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
321     c->a      = ca;
322     c->free_a = PETSC_TRUE;
323   }
324 
325   /* clean old values in C */
326   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
327   /* Traverse A row-wise. */
328   /* Build the ith row in C by summing over nonzero columns in A, */
329   /* the rows of B corresponding to nonzeros of A. */
330   for (i=0; i<am; i++) {
331     anzi = ai[i+1] - ai[i];
332     cnzi = ci[i+1] - ci[i];
333     for (j=0; j<anzi; j++) {
334       brow = aj[j];
335       bnzi = bi[brow+1] - bi[brow];
336       bjj  = bj + bi[brow];
337       baj  = ba + bi[brow];
338       /* perform sparse axpy */
339       valtmp = aa[j];
340       nextb  = 0;
341       for (k=0; nextb<bnzi; k++) {
342         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
343           ca[k] += valtmp*baj[nextb++];
344         }
345       }
346       flops += 2*bnzi;
347     }
348     aj += anzi; aa += anzi;
349     cj += cnzi; ca += cnzi;
350   }
351 
352   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
353   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
354   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
355   PetscFunctionReturn(0);
356 }
357 
358 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C)
359 {
360   PetscErrorCode     ierr;
361   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
362   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
363   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
364   MatScalar          *ca;
365   PetscReal          afill;
366   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
367   PetscTable         ta;
368   PetscFreeSpaceList free_space=NULL,current_space=NULL;
369 
370   PetscFunctionBegin;
371   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
372   /*-----------------------------------------------------------------------------------------*/
373   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
374   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
375   ci[0] = 0;
376 
377   /* create and initialize a linked list */
378   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
379   MatRowMergeMax_SeqAIJ(b,bm,ta);
380   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
381   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
382 
383   ierr = PetscLLCondensedCreate_fast(Crmax,&lnk);CHKERRQ(ierr);
384 
385   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
386   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
387   current_space = free_space;
388 
389   /* Determine ci and cj */
390   for (i=0; i<am; i++) {
391     anzi = ai[i+1] - ai[i];
392     aj   = a->j + ai[i];
393     for (j=0; j<anzi; j++) {
394       brow = aj[j];
395       bnzj = bi[brow+1] - bi[brow];
396       bj   = b->j + bi[brow];
397       /* add non-zero cols of B into the sorted linked list lnk */
398       ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr);
399     }
400     cnzi = lnk[1];
401 
402     /* If free space is not available, make more free space */
403     /* Double the amount of total space in the list */
404     if (current_space->local_remaining<cnzi) {
405       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
406       ndouble++;
407     }
408 
409     /* Copy data into free space, then initialize lnk */
410     ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr);
411 
412     current_space->array           += cnzi;
413     current_space->local_used      += cnzi;
414     current_space->local_remaining -= cnzi;
415 
416     ci[i+1] = ci[i] + cnzi;
417   }
418 
419   /* Column indices are in the list of free space */
420   /* Allocate space for cj, initialize cj, and */
421   /* destroy list of free space and other temporary array(s) */
422   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
423   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
424   ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr);
425 
426   /* Allocate space for ca */
427   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
428 
429   /* put together the new symbolic matrix */
430   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
431   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
432 
433   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
434   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
435   c          = (Mat_SeqAIJ*)(C->data);
436   c->free_a  = PETSC_TRUE;
437   c->free_ij = PETSC_TRUE;
438   c->nonew   = 0;
439 
440   /* slower, less memory */
441   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
442 
443   /* set MatInfo */
444   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
445   if (afill < 1.0) afill = 1.0;
446   c->maxnz                     = ci[am];
447   c->nz                        = ci[am];
448   C->info.mallocs           = ndouble;
449   C->info.fill_ratio_given  = fill;
450   C->info.fill_ratio_needed = afill;
451 
452 #if defined(PETSC_USE_INFO)
453   if (ci[am]) {
454     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
455     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
456   } else {
457     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
458   }
459 #endif
460   PetscFunctionReturn(0);
461 }
462 
463 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C)
464 {
465   PetscErrorCode     ierr;
466   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
467   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
468   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
469   MatScalar          *ca;
470   PetscReal          afill;
471   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
472   PetscTable         ta;
473   PetscFreeSpaceList free_space=NULL,current_space=NULL;
474 
475   PetscFunctionBegin;
476   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
477   /*---------------------------------------------------------------------------------------------*/
478   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
479   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
480   ci[0] = 0;
481 
482   /* create and initialize a linked list */
483   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
484   MatRowMergeMax_SeqAIJ(b,bm,ta);
485   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
486   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
487   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
488 
489   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
490   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
491   current_space = free_space;
492 
493   /* Determine ci and cj */
494   for (i=0; i<am; i++) {
495     anzi = ai[i+1] - ai[i];
496     aj   = a->j + ai[i];
497     for (j=0; j<anzi; j++) {
498       brow = aj[j];
499       bnzj = bi[brow+1] - bi[brow];
500       bj   = b->j + bi[brow];
501       /* add non-zero cols of B into the sorted linked list lnk */
502       ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr);
503     }
504     cnzi = lnk[0];
505 
506     /* If free space is not available, make more free space */
507     /* Double the amount of total space in the list */
508     if (current_space->local_remaining<cnzi) {
509       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
510       ndouble++;
511     }
512 
513     /* Copy data into free space, then initialize lnk */
514     ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr);
515 
516     current_space->array           += cnzi;
517     current_space->local_used      += cnzi;
518     current_space->local_remaining -= cnzi;
519 
520     ci[i+1] = ci[i] + cnzi;
521   }
522 
523   /* Column indices are in the list of free space */
524   /* Allocate space for cj, initialize cj, and */
525   /* destroy list of free space and other temporary array(s) */
526   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
527   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
528   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
529 
530   /* Allocate space for ca */
531   /*-----------------------*/
532   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
533 
534   /* put together the new symbolic matrix */
535   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
536   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
537 
538   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
539   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
540   c          = (Mat_SeqAIJ*)(C->data);
541   c->free_a  = PETSC_TRUE;
542   c->free_ij = PETSC_TRUE;
543   c->nonew   = 0;
544 
545   /* slower, less memory */
546   C->ops->matmultnumeric    = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
547 
548   /* set MatInfo */
549   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
550   if (afill < 1.0) afill = 1.0;
551   c->maxnz                     = ci[am];
552   c->nz                        = ci[am];
553   C->info.mallocs           = ndouble;
554   C->info.fill_ratio_given  = fill;
555   C->info.fill_ratio_needed = afill;
556 
557 #if defined(PETSC_USE_INFO)
558   if (ci[am]) {
559     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
560     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
561   } else {
562     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
563   }
564 #endif
565   PetscFunctionReturn(0);
566 }
567 
568 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C)
569 {
570   PetscErrorCode     ierr;
571   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
572   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
573   PetscInt           *ci,*cj,*bb;
574   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
575   PetscReal          afill;
576   PetscInt           i,j,col,ndouble = 0;
577   PetscFreeSpaceList free_space=NULL,current_space=NULL;
578   PetscHeap          h;
579 
580   PetscFunctionBegin;
581   /* Get ci and cj - by merging sorted rows using a heap */
582   /*---------------------------------------------------------------------------------------------*/
583   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
584   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
585   ci[0] = 0;
586 
587   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
588   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
589   current_space = free_space;
590 
591   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
592   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
593 
594   /* Determine ci and cj */
595   for (i=0; i<am; i++) {
596     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
597     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
598     ci[i+1] = ci[i];
599     /* Populate the min heap */
600     for (j=0; j<anzi; j++) {
601       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
602       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
603         ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr);
604       }
605     }
606     /* Pick off the min element, adding it to free space */
607     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
608     while (j >= 0) {
609       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
610         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
611         ndouble++;
612       }
613       *(current_space->array++) = col;
614       current_space->local_used++;
615       current_space->local_remaining--;
616       ci[i+1]++;
617 
618       /* stash if anything else remains in this row of B */
619       if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);}
620       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
621         PetscInt j2,col2;
622         ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr);
623         if (col2 != col) break;
624         ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr);
625         if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);}
626       }
627       /* Put any stashed elements back into the min heap */
628       ierr = PetscHeapUnstash(h);CHKERRQ(ierr);
629       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
630     }
631   }
632   ierr = PetscFree(bb);CHKERRQ(ierr);
633   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
634 
635   /* Column indices are in the list of free space */
636   /* Allocate space for cj, initialize cj, and */
637   /* destroy list of free space and other temporary array(s) */
638   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
639   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
640 
641   /* put together the new symbolic matrix */
642   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
643   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
644 
645   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
646   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
647   c          = (Mat_SeqAIJ*)(C->data);
648   c->free_a  = PETSC_TRUE;
649   c->free_ij = PETSC_TRUE;
650   c->nonew   = 0;
651 
652   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
653 
654   /* set MatInfo */
655   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
656   if (afill < 1.0) afill = 1.0;
657   c->maxnz                     = ci[am];
658   c->nz                        = ci[am];
659   C->info.mallocs           = ndouble;
660   C->info.fill_ratio_given  = fill;
661   C->info.fill_ratio_needed = afill;
662 
663 #if defined(PETSC_USE_INFO)
664   if (ci[am]) {
665     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
666     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
667   } else {
668     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
669   }
670 #endif
671   PetscFunctionReturn(0);
672 }
673 
674 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C)
675 {
676   PetscErrorCode     ierr;
677   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
678   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
679   PetscInt           *ci,*cj,*bb;
680   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
681   PetscReal          afill;
682   PetscInt           i,j,col,ndouble = 0;
683   PetscFreeSpaceList free_space=NULL,current_space=NULL;
684   PetscHeap          h;
685   PetscBT            bt;
686 
687   PetscFunctionBegin;
688   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
689   /*---------------------------------------------------------------------------------------------*/
690   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
691   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
692   ci[0] = 0;
693 
694   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
695   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
696 
697   current_space = free_space;
698 
699   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
700   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
701   ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr);
702 
703   /* Determine ci and cj */
704   for (i=0; i<am; i++) {
705     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
706     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
707     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
708     ci[i+1] = ci[i];
709     /* Populate the min heap */
710     for (j=0; j<anzi; j++) {
711       PetscInt brow = acol[j];
712       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
713         PetscInt bcol = bj[bb[j]];
714         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
715           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
716           bb[j]++;
717           break;
718         }
719       }
720     }
721     /* Pick off the min element, adding it to free space */
722     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
723     while (j >= 0) {
724       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
725         fptr = NULL;                      /* need PetscBTMemzero */
726         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
727         ndouble++;
728       }
729       *(current_space->array++) = col;
730       current_space->local_used++;
731       current_space->local_remaining--;
732       ci[i+1]++;
733 
734       /* stash if anything else remains in this row of B */
735       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
736         PetscInt bcol = bj[bb[j]];
737         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
738           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
739           bb[j]++;
740           break;
741         }
742       }
743       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
744     }
745     if (fptr) {                 /* Clear the bits for this row */
746       for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);}
747     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
748       ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr);
749     }
750   }
751   ierr = PetscFree(bb);CHKERRQ(ierr);
752   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
753   ierr = PetscBTDestroy(&bt);CHKERRQ(ierr);
754 
755   /* Column indices are in the list of free space */
756   /* Allocate space for cj, initialize cj, and */
757   /* destroy list of free space and other temporary array(s) */
758   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
759   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
760 
761   /* put together the new symbolic matrix */
762   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
763   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
764 
765   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
766   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
767   c          = (Mat_SeqAIJ*)(C->data);
768   c->free_a  = PETSC_TRUE;
769   c->free_ij = PETSC_TRUE;
770   c->nonew   = 0;
771 
772   C->ops->matmultnumeric        = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
773 
774   /* set MatInfo */
775   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
776   if (afill < 1.0) afill = 1.0;
777   c->maxnz                     = ci[am];
778   c->nz                        = ci[am];
779   C->info.mallocs           = ndouble;
780   C->info.fill_ratio_given  = fill;
781   C->info.fill_ratio_needed = afill;
782 
783 #if defined(PETSC_USE_INFO)
784   if (ci[am]) {
785     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
786     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
787   } else {
788     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
789   }
790 #endif
791   PetscFunctionReturn(0);
792 }
793 
794 
795 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C)
796 {
797   PetscErrorCode     ierr;
798   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
799   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1;
800   PetscInt           *ci,*cj,*outputj,worki_L1[9],worki_L2[9];
801   PetscInt           c_maxmem,a_maxrownnz=0,a_rownnz;
802   const PetscInt     workcol[8]={0,1,2,3,4,5,6,7};
803   const PetscInt     am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
804   const PetscInt     *brow_ptr[8],*brow_end[8];
805   PetscInt           window[8];
806   PetscInt           window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows;
807   PetscInt           i,k,ndouble=0,L1_rowsleft,rowsleft;
808   PetscReal          afill;
809   PetscInt           *workj_L1,*workj_L2,*workj_L3;
810   PetscInt           L1_nnz,L2_nnz;
811 
812   /* Step 1: Get upper bound on memory required for allocation.
813              Because of the way virtual memory works,
814              only the memory pages that are actually needed will be physically allocated. */
815   PetscFunctionBegin;
816   ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
817   for (i=0; i<am; i++) {
818     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
819     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
820     a_rownnz = 0;
821     for (k=0; k<anzi; ++k) {
822       a_rownnz += bi[acol[k]+1] - bi[acol[k]];
823       if (a_rownnz > bn) {
824         a_rownnz = bn;
825         break;
826       }
827     }
828     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
829   }
830   /* temporary work areas for merging rows */
831   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L1);CHKERRQ(ierr);
832   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L2);CHKERRQ(ierr);
833   ierr = PetscMalloc1(a_maxrownnz,&workj_L3);CHKERRQ(ierr);
834 
835   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
836   c_maxmem = 8*(ai[am]+bi[bm]);
837   /* Step 2: Populate pattern for C */
838   ierr  = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr);
839 
840   ci_nnz       = 0;
841   ci[0]        = 0;
842   worki_L1[0]  = 0;
843   worki_L2[0]  = 0;
844   for (i=0; i<am; i++) {
845     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
846     const PetscInt *acol = aj + ai[i];      /* column indices of nonzero entries in this row */
847     rowsleft             = anzi;
848     inputcol_L1          = acol;
849     L2_nnz               = 0;
850     L2_nrows             = 1;  /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
851     worki_L2[1]          = 0;
852     outputi_nnz          = 0;
853 
854     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
855     while (ci_nnz+a_maxrownnz > c_maxmem) {
856       c_maxmem *= 2;
857       ndouble++;
858       ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr);
859     }
860 
861     while (rowsleft) {
862       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
863       L1_nrows    = 0;
864       L1_nnz      = 0;
865       inputcol    = inputcol_L1;
866       inputi      = bi;
867       inputj      = bj;
868 
869       /* The following macro is used to specialize for small rows in A.
870          This helps with compiler unrolling, improving performance substantially.
871           Input:  inputj   inputi  inputcol  bn
872           Output: outputj  outputi_nnz                       */
873        #define MatMatMultSymbolic_RowMergeMacro(ANNZ)                        \
874          window_min  = bn;                                                   \
875          outputi_nnz = 0;                                                    \
876          for (k=0; k<ANNZ; ++k) {                                            \
877            brow_ptr[k] = inputj + inputi[inputcol[k]];                       \
878            brow_end[k] = inputj + inputi[inputcol[k]+1];                     \
879            window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn;   \
880            window_min  = PetscMin(window[k], window_min);                    \
881          }                                                                   \
882          while (window_min < bn) {                                           \
883            outputj[outputi_nnz++] = window_min;                              \
884            /* advance front and compute new minimum */                       \
885            old_window_min = window_min;                                      \
886            window_min = bn;                                                  \
887            for (k=0; k<ANNZ; ++k) {                                          \
888              if (window[k] == old_window_min) {                              \
889                brow_ptr[k]++;                                                \
890                window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
891              }                                                               \
892              window_min = PetscMin(window[k], window_min);                   \
893            }                                                                 \
894          }
895 
896       /************** L E V E L  1 ***************/
897       /* Merge up to 8 rows of B to L1 work array*/
898       while (L1_rowsleft) {
899         outputi_nnz = 0;
900         if (anzi > 8)  outputj = workj_L1 + L1_nnz;     /* Level 1 rowmerge*/
901         else           outputj = cj + ci_nnz;           /* Merge directly to C */
902 
903         switch (L1_rowsleft) {
904         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
905                  brow_end[0] = inputj + inputi[inputcol[0]+1];
906                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
907                  inputcol    += L1_rowsleft;
908                  rowsleft    -= L1_rowsleft;
909                  L1_rowsleft  = 0;
910                  break;
911         case 2:  MatMatMultSymbolic_RowMergeMacro(2);
912                  inputcol    += L1_rowsleft;
913                  rowsleft    -= L1_rowsleft;
914                  L1_rowsleft  = 0;
915                  break;
916         case 3: MatMatMultSymbolic_RowMergeMacro(3);
917                  inputcol    += L1_rowsleft;
918                  rowsleft    -= L1_rowsleft;
919                  L1_rowsleft  = 0;
920                  break;
921         case 4:  MatMatMultSymbolic_RowMergeMacro(4);
922                  inputcol    += L1_rowsleft;
923                  rowsleft    -= L1_rowsleft;
924                  L1_rowsleft  = 0;
925                  break;
926         case 5:  MatMatMultSymbolic_RowMergeMacro(5);
927                  inputcol    += L1_rowsleft;
928                  rowsleft    -= L1_rowsleft;
929                  L1_rowsleft  = 0;
930                  break;
931         case 6:  MatMatMultSymbolic_RowMergeMacro(6);
932                  inputcol    += L1_rowsleft;
933                  rowsleft    -= L1_rowsleft;
934                  L1_rowsleft  = 0;
935                  break;
936         case 7:  MatMatMultSymbolic_RowMergeMacro(7);
937                  inputcol    += L1_rowsleft;
938                  rowsleft    -= L1_rowsleft;
939                  L1_rowsleft  = 0;
940                  break;
941         default: MatMatMultSymbolic_RowMergeMacro(8);
942                  inputcol    += 8;
943                  rowsleft    -= 8;
944                  L1_rowsleft -= 8;
945                  break;
946         }
947         inputcol_L1           = inputcol;
948         L1_nnz               += outputi_nnz;
949         worki_L1[++L1_nrows]  = L1_nnz;
950       }
951 
952       /********************** L E V E L  2 ************************/
953       /* Merge from L1 work array to either C or to L2 work array */
954       if (anzi > 8) {
955         inputi      = worki_L1;
956         inputj      = workj_L1;
957         inputcol    = workcol;
958         outputi_nnz = 0;
959 
960         if (anzi <= 64) outputj = cj + ci_nnz;        /* Merge from L1 work array to C */
961         else            outputj = workj_L2 + L2_nnz;  /* Merge from L1 work array to L2 work array */
962 
963         switch (L1_nrows) {
964         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
965                  brow_end[0] = inputj + inputi[inputcol[0]+1];
966                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
967                  break;
968         case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
969         case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
970         case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
971         case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
972         case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
973         case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
974         case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
975         default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
976         }
977         L2_nnz               += outputi_nnz;
978         worki_L2[++L2_nrows]  = L2_nnz;
979 
980         /************************ L E V E L  3 **********************/
981         /* Merge from L2 work array to either C or to L2 work array */
982         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
983           inputi      = worki_L2;
984           inputj      = workj_L2;
985           inputcol    = workcol;
986           outputi_nnz = 0;
987           if (rowsleft) outputj = workj_L3;
988           else          outputj = cj + ci_nnz;
989           switch (L2_nrows) {
990           case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
991                    brow_end[0] = inputj + inputi[inputcol[0]+1];
992                    for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
993                    break;
994           case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
995           case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
996           case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
997           case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
998           case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
999           case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1000           case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1001           default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1002           }
1003           L2_nrows    = 1;
1004           L2_nnz      = outputi_nnz;
1005           worki_L2[1] = outputi_nnz;
1006           /* Copy to workj_L2 */
1007           if (rowsleft) {
1008             for (k=0; k<outputi_nnz; ++k)  workj_L2[k] = outputj[k];
1009           }
1010         }
1011       }
1012     }  /* while (rowsleft) */
1013 #undef MatMatMultSymbolic_RowMergeMacro
1014 
1015     /* terminate current row */
1016     ci_nnz += outputi_nnz;
1017     ci[i+1] = ci_nnz;
1018   }
1019 
1020   /* Step 3: Create the new symbolic matrix */
1021   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
1022   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
1023 
1024   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1025   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1026   c          = (Mat_SeqAIJ*)(C->data);
1027   c->free_a  = PETSC_TRUE;
1028   c->free_ij = PETSC_TRUE;
1029   c->nonew   = 0;
1030 
1031   C->ops->matmultnumeric        = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1032 
1033   /* set MatInfo */
1034   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1035   if (afill < 1.0) afill = 1.0;
1036   c->maxnz                     = ci[am];
1037   c->nz                        = ci[am];
1038   C->info.mallocs           = ndouble;
1039   C->info.fill_ratio_given  = fill;
1040   C->info.fill_ratio_needed = afill;
1041 
1042 #if defined(PETSC_USE_INFO)
1043   if (ci[am]) {
1044     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
1045     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1046   } else {
1047     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
1048   }
1049 #endif
1050 
1051   /* Step 4: Free temporary work areas */
1052   ierr = PetscFree(workj_L1);CHKERRQ(ierr);
1053   ierr = PetscFree(workj_L2);CHKERRQ(ierr);
1054   ierr = PetscFree(workj_L3);CHKERRQ(ierr);
1055   PetscFunctionReturn(0);
1056 }
1057 
1058 /* concatenate unique entries and then sort */
1059 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C)
1060 {
1061   PetscErrorCode     ierr;
1062   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1063   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
1064   PetscInt           *ci,*cj;
1065   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1066   PetscReal          afill;
1067   PetscInt           i,j,ndouble = 0;
1068   PetscSegBuffer     seg,segrow;
1069   char               *seen;
1070 
1071   PetscFunctionBegin;
1072   ierr  = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
1073   ci[0] = 0;
1074 
1075   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1076   ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr);
1077   ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr);
1078   ierr = PetscCalloc1(bn,&seen);CHKERRQ(ierr);
1079 
1080   /* Determine ci and cj */
1081   for (i=0; i<am; i++) {
1082     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1083     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1084     PetscInt packlen = 0,*PETSC_RESTRICT crow;
1085     /* Pack segrow */
1086     for (j=0; j<anzi; j++) {
1087       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
1088       for (k=bjstart; k<bjend; k++) {
1089         PetscInt bcol = bj[k];
1090         if (!seen[bcol]) { /* new entry */
1091           PetscInt *PETSC_RESTRICT slot;
1092           ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr);
1093           *slot = bcol;
1094           seen[bcol] = 1;
1095           packlen++;
1096         }
1097       }
1098     }
1099     ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr);
1100     ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr);
1101     ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr);
1102     ci[i+1] = ci[i] + packlen;
1103     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
1104   }
1105   ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr);
1106   ierr = PetscFree(seen);CHKERRQ(ierr);
1107 
1108   /* Column indices are in the segmented buffer */
1109   ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr);
1110   ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr);
1111 
1112   /* put together the new symbolic matrix */
1113   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);CHKERRQ(ierr);
1114   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
1115 
1116   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1117   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1118   c          = (Mat_SeqAIJ*)(C->data);
1119   c->free_a  = PETSC_TRUE;
1120   c->free_ij = PETSC_TRUE;
1121   c->nonew   = 0;
1122 
1123   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1124 
1125   /* set MatInfo */
1126   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1127   if (afill < 1.0) afill = 1.0;
1128   c->maxnz                     = ci[am];
1129   c->nz                        = ci[am];
1130   C->info.mallocs           = ndouble;
1131   C->info.fill_ratio_given  = fill;
1132   C->info.fill_ratio_needed = afill;
1133 
1134 #if defined(PETSC_USE_INFO)
1135   if (ci[am]) {
1136     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
1137     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1138   } else {
1139     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
1140   }
1141 #endif
1142   PetscFunctionReturn(0);
1143 }
1144 
1145 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A)
1146 {
1147   PetscErrorCode      ierr;
1148   Mat_SeqAIJ          *a=(Mat_SeqAIJ*)A->data;
1149   Mat_MatMatTransMult *abt=a->abt;
1150 
1151   PetscFunctionBegin;
1152   ierr = (abt->destroy)(A);CHKERRQ(ierr);
1153   ierr = MatTransposeColoringDestroy(&abt->matcoloring);CHKERRQ(ierr);
1154   ierr = MatDestroy(&abt->Bt_den);CHKERRQ(ierr);
1155   ierr = MatDestroy(&abt->ABt_den);CHKERRQ(ierr);
1156   ierr = PetscFree(abt);CHKERRQ(ierr);
1157   PetscFunctionReturn(0);
1158 }
1159 
1160 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1161 {
1162   PetscErrorCode      ierr;
1163   Mat                 Bt;
1164   PetscInt            *bti,*btj;
1165   Mat_MatMatTransMult *abt;
1166   Mat_SeqAIJ          *c;
1167   Mat_Product         *product = C->product;
1168   MatProductAlgorithm alg = product->alg;
1169 
1170   PetscFunctionBegin;
1171   /* create symbolic Bt */
1172   ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
1173   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr);
1174   ierr = MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
1175   ierr = MatSetType(Bt,((PetscObject)A)->type_name);CHKERRQ(ierr);
1176 
1177   /* get symbolic C=A*Bt */
1178   ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); /* set algorithm for C = A*Bt */
1179   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr);
1180   ierr = MatProductSetAlgorithm(C,alg);CHKERRQ(ierr); /* resume original algorithm for ABt product */
1181 
1182   /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
1183   ierr   = PetscNew(&abt);CHKERRQ(ierr);
1184   c      = (Mat_SeqAIJ*)C->data;
1185   c->abt = abt;
1186 
1187   abt->usecoloring = PETSC_FALSE;
1188   abt->destroy     = C->ops->destroy;
1189   C->ops->destroy  = MatDestroy_SeqAIJ_MatMatMultTrans;
1190   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
1191 
1192   abt->usecoloring = PETSC_FALSE;
1193   ierr = PetscStrcmp(product->alg,"color",&abt->usecoloring);CHKERRQ(ierr);
1194   if (abt->usecoloring) {
1195     /* Create MatTransposeColoring from symbolic C=A*B^T */
1196     MatTransposeColoring matcoloring;
1197     MatColoring          coloring;
1198     ISColoring           iscoloring;
1199     Mat                  Bt_dense,C_dense;
1200 
1201     /* inode causes memory problem */
1202     ierr = MatSetOption(C,MAT_USE_INODES,PETSC_FALSE);CHKERRQ(ierr);
1203 
1204     ierr = MatColoringCreate(C,&coloring);CHKERRQ(ierr);
1205     ierr = MatColoringSetDistance(coloring,2);CHKERRQ(ierr);
1206     ierr = MatColoringSetType(coloring,MATCOLORINGSL);CHKERRQ(ierr);
1207     ierr = MatColoringSetFromOptions(coloring);CHKERRQ(ierr);
1208     ierr = MatColoringApply(coloring,&iscoloring);CHKERRQ(ierr);
1209     ierr = MatColoringDestroy(&coloring);CHKERRQ(ierr);
1210     ierr = MatTransposeColoringCreate(C,iscoloring,&matcoloring);CHKERRQ(ierr);
1211 
1212     abt->matcoloring = matcoloring;
1213 
1214     ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr);
1215 
1216     /* Create Bt_dense and C_dense = A*Bt_dense */
1217     ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr);
1218     ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr);
1219     ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr);
1220     ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr);
1221 
1222     Bt_dense->assembled = PETSC_TRUE;
1223     abt->Bt_den         = Bt_dense;
1224 
1225     ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr);
1226     ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr);
1227     ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr);
1228     ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr);
1229 
1230     Bt_dense->assembled = PETSC_TRUE;
1231     abt->ABt_den  = C_dense;
1232 
1233 #if defined(PETSC_USE_INFO)
1234     {
1235       Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
1236       ierr = PetscInfo7(C,"Use coloring of C=A*B^T; B^T: %D %D, Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));CHKERRQ(ierr);
1237     }
1238 #endif
1239   }
1240   /* clean up */
1241   ierr = MatDestroy(&Bt);CHKERRQ(ierr);
1242   ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
1243   PetscFunctionReturn(0);
1244 }
1245 
1246 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1247 {
1248   PetscErrorCode      ierr;
1249   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1250   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
1251   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
1252   PetscLogDouble      flops=0.0;
1253   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
1254   Mat_MatMatTransMult *abt = c->abt;
1255 
1256   PetscFunctionBegin;
1257   /* clear old values in C */
1258   if (!c->a) {
1259     ierr      = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
1260     c->a      = ca;
1261     c->free_a = PETSC_TRUE;
1262   } else {
1263     ca =  c->a;
1264     ierr = PetscArrayzero(ca,ci[cm]+1);CHKERRQ(ierr);
1265   }
1266 
1267   if (abt->usecoloring) {
1268     MatTransposeColoring matcoloring = abt->matcoloring;
1269     Mat                  Bt_dense,C_dense = abt->ABt_den;
1270 
1271     /* Get Bt_dense by Apply MatTransposeColoring to B */
1272     Bt_dense = abt->Bt_den;
1273     ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr);
1274 
1275     /* C_dense = A*Bt_dense */
1276     ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr);
1277 
1278     /* Recover C from C_dense */
1279     ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr);
1280     PetscFunctionReturn(0);
1281   }
1282 
1283   for (i=0; i<cm; i++) {
1284     anzi = ai[i+1] - ai[i];
1285     acol = aj + ai[i];
1286     aval = aa + ai[i];
1287     cnzi = ci[i+1] - ci[i];
1288     ccol = cj + ci[i];
1289     cval = ca + ci[i];
1290     for (j=0; j<cnzi; j++) {
1291       brow = ccol[j];
1292       bnzj = bi[brow+1] - bi[brow];
1293       bcol = bj + bi[brow];
1294       bval = ba + bi[brow];
1295 
1296       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
1297       nexta = 0; nextb = 0;
1298       while (nexta<anzi && nextb<bnzj) {
1299         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
1300         if (nexta == anzi) break;
1301         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
1302         if (nextb == bnzj) break;
1303         if (acol[nexta] == bcol[nextb]) {
1304           cval[j] += aval[nexta]*bval[nextb];
1305           nexta++; nextb++;
1306           flops += 2;
1307         }
1308       }
1309     }
1310   }
1311   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1312   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1313   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1314   PetscFunctionReturn(0);
1315 }
1316 
1317 PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(Mat A)
1318 {
1319   PetscErrorCode      ierr;
1320   Mat_SeqAIJ          *a = (Mat_SeqAIJ*)A->data;
1321   Mat_MatTransMatMult *atb = a->atb;
1322 
1323   PetscFunctionBegin;
1324   if (atb) {
1325     ierr = MatDestroy(&atb->At);CHKERRQ(ierr);
1326     ierr = (*atb->destroy)(A);CHKERRQ(ierr);
1327   }
1328   ierr = PetscFree(atb);CHKERRQ(ierr);
1329   PetscFunctionReturn(0);
1330 }
1331 
1332 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1333 {
1334   PetscErrorCode      ierr;
1335   Mat                 At;
1336   PetscInt            *ati,*atj;
1337   Mat_Product         *product = C->product;
1338   MatProductAlgorithm alg;
1339   PetscBool           flg;
1340 
1341   PetscFunctionBegin;
1342   if (product) {
1343     alg = product->alg;
1344   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"!product, not supported yet");
1345 
1346   /* outerproduct */
1347   ierr = PetscStrcmp(alg,"outerproduct",&flg);CHKERRQ(ierr);
1348   if (flg) {
1349     /* create symbolic At */
1350     ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
1351     ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr);
1352     ierr = MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
1353     ierr = MatSetType(At,((PetscObject)A)->type_name);CHKERRQ(ierr);
1354 
1355     /* get symbolic C=At*B */
1356     product->alg = "sorted";
1357     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
1358 
1359     /* clean up */
1360     ierr = MatDestroy(&At);CHKERRQ(ierr);
1361     ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
1362 
1363     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
1364     PetscFunctionReturn(0);
1365   }
1366 
1367   /* matmatmult */
1368   ierr = PetscStrcmp(alg,"at*b",&flg);CHKERRQ(ierr);
1369   if (flg) {
1370     Mat_MatTransMatMult *atb;
1371     Mat_SeqAIJ          *c;
1372 
1373     ierr = PetscNew(&atb);CHKERRQ(ierr);
1374     ierr = MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);CHKERRQ(ierr);
1375     product->alg = "sorted";
1376     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
1377 
1378     c               = (Mat_SeqAIJ*)C->data;
1379     c->atb          = atb;
1380     atb->At         = At;
1381     atb->destroy    = C->ops->destroy;
1382     atb->updateAt   = PETSC_FALSE; /* because At is computed here */
1383     C->ops->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
1384 
1385     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
1386     PetscFunctionReturn(0);
1387   }
1388 
1389   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1390   PetscFunctionReturn(0);
1391 }
1392 
1393 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1394 {
1395   PetscErrorCode ierr;
1396   Mat_SeqAIJ     *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1397   PetscInt       am   =A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1398   PetscInt       cm   =C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1399   PetscLogDouble flops=0.0;
1400   MatScalar      *aa  =a->a,*ba,*ca,*caj;
1401 
1402   PetscFunctionBegin;
1403   if (!c->a) {
1404     ierr = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
1405 
1406     c->a      = ca;
1407     c->free_a = PETSC_TRUE;
1408   } else {
1409     ca   = c->a;
1410     ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
1411   }
1412 
1413   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1414   for (i=0; i<am; i++) {
1415     bj   = b->j + bi[i];
1416     ba   = b->a + bi[i];
1417     bnzi = bi[i+1] - bi[i];
1418     anzi = ai[i+1] - ai[i];
1419     for (j=0; j<anzi; j++) {
1420       nextb = 0;
1421       crow  = *aj++;
1422       cjj   = cj + ci[crow];
1423       caj   = ca + ci[crow];
1424       /* perform sparse axpy operation.  Note cjj includes bj. */
1425       for (k=0; nextb<bnzi; k++) {
1426         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
1427           caj[k] += (*aa)*(*(ba+nextb));
1428           nextb++;
1429         }
1430       }
1431       flops += 2*bnzi;
1432       aa++;
1433     }
1434   }
1435 
1436   /* Assemble the final matrix and clean up */
1437   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1438   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1439   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1440   PetscFunctionReturn(0);
1441 }
1442 
1443 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
1444 {
1445   PetscErrorCode ierr;
1446 
1447   PetscFunctionBegin;
1448   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
1449 
1450   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1451   PetscFunctionReturn(0);
1452 }
1453 
1454 PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1455 {
1456   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
1457   Mat_SeqDense      *bd=(Mat_SeqDense*)B->data;
1458   Mat_SeqDense      *cd=(Mat_SeqDense*)C->data;
1459   PetscErrorCode    ierr;
1460   PetscScalar       *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4;
1461   const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av;
1462   const PetscInt    *aj;
1463   PetscInt          cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n;
1464   PetscInt          clda=cd->lda;
1465   PetscInt          am4=4*clda,bm4=4*bm,col,i,j,n;
1466 
1467   PetscFunctionBegin;
1468   if (!cm || !cn) PetscFunctionReturn(0);
1469   ierr = MatSeqAIJGetArrayRead(A,&av);CHKERRQ(ierr);
1470   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1471   ierr = MatDenseGetArrayRead(B,&b);CHKERRQ(ierr);
1472   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1473   c1 = c; c2 = c1 + clda; c3 = c2 + clda; c4 = c3 + clda;
1474   for (col=0; col<(cn/4)*4; col += 4) {  /* over columns of C */
1475     for (i=0; i<am; i++) {        /* over rows of A in those columns */
1476       r1 = r2 = r3 = r4 = 0.0;
1477       n  = a->i[i+1] - a->i[i];
1478       aj = a->j + a->i[i];
1479       aa = av + a->i[i];
1480       for (j=0; j<n; j++) {
1481         const PetscScalar aatmp = aa[j];
1482         const PetscInt    ajtmp = aj[j];
1483         r1 += aatmp*b1[ajtmp];
1484         r2 += aatmp*b2[ajtmp];
1485         r3 += aatmp*b3[ajtmp];
1486         r4 += aatmp*b4[ajtmp];
1487       }
1488       c1[i] += r1;
1489       c2[i] += r2;
1490       c3[i] += r3;
1491       c4[i] += r4;
1492     }
1493     b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1494     c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1495   }
1496   for (; col<cn; col++) {   /* over extra columns of C */
1497     for (i=0; i<am; i++) {  /* over rows of C in those columns */
1498       r1 = 0.0;
1499       n  = a->i[i+1] - a->i[i];
1500       aj = a->j + a->i[i];
1501       aa = av + a->i[i];
1502       for (j=0; j<n; j++) {
1503         r1 += aa[j]*b1[aj[j]];
1504       }
1505       c1[i] += r1;
1506     }
1507     b1 += bm;
1508     c1 += clda;
1509   }
1510   ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr);
1511   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1512   ierr = MatDenseRestoreArrayRead(B,&b);CHKERRQ(ierr);
1513   ierr = MatSeqAIJRestoreArrayRead(A,&av);CHKERRQ(ierr);
1514   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1515   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1516   PetscFunctionReturn(0);
1517 }
1518 
1519 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1520 {
1521   PetscErrorCode ierr;
1522 
1523   PetscFunctionBegin;
1524   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
1525   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1526   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1527 
1528   ierr = MatZeroEntries(C);CHKERRQ(ierr);
1529   ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C);CHKERRQ(ierr);
1530   PetscFunctionReturn(0);
1531 }
1532 
1533 /* ------------------------------------------------------- */
1534 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1535 {
1536   PetscFunctionBegin;
1537   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
1538   C->ops->productsymbolic = MatProductSymbolic_AB;
1539   /* dense mat may not call MatProductSymbolic(), thus set C->ops->productnumeric here */
1540   C->ops->productnumeric  = MatProductNumeric_AB;
1541   PetscFunctionReturn(0);
1542 }
1543 
1544 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1545 {
1546   PetscFunctionBegin;
1547   C->ops->transposematmultsymbolic = MatTransposeMatMultSymbolic_SeqAIJ_SeqDense;
1548   C->ops->productsymbolic          = MatProductSymbolic_AtB;
1549   C->ops->productnumeric           = MatProductNumeric_AtB;
1550   PetscFunctionReturn(0);
1551 }
1552 
1553 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1554 {
1555   PetscErrorCode ierr;
1556   Mat_Product    *product = C->product;
1557 
1558   PetscFunctionBegin;
1559   switch (product->type) {
1560   case MATPRODUCT_AB:
1561     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C);CHKERRQ(ierr);
1562     break;
1563   case MATPRODUCT_AtB:
1564     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C);CHKERRQ(ierr);
1565     break;
1566   case MATPRODUCT_PtAP:
1567     ierr = MatProductSetFromOptions_SeqDense(C);CHKERRQ(ierr);
1568     break;
1569   default:
1570     /* Use MatProduct_Basic() if there is no specific implementation */
1571     C->ops->productsymbolic = MatProductSymbolic_Basic;
1572   }
1573   PetscFunctionReturn(0);
1574 }
1575 /* ------------------------------------------------------- */
1576 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1577 {
1578   PetscErrorCode ierr;
1579   Mat_Product    *product = C->product;
1580   Mat            A = product->A;
1581   PetscBool      baij;
1582 
1583   PetscFunctionBegin;
1584   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij);CHKERRQ(ierr);
1585   if (!baij) { /* A is seqsbaij */
1586     PetscBool sbaij;
1587     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij);CHKERRQ(ierr);
1588     if (!sbaij) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format");
1589 
1590     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
1591   } else { /* A is seqbaij */
1592     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
1593   }
1594 
1595   C->ops->productsymbolic = MatProductSymbolic_AB;
1596   C->ops->productnumeric  = MatProductNumeric_AB;
1597   PetscFunctionReturn(0);
1598 }
1599 
1600 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1601 {
1602   PetscErrorCode ierr;
1603   Mat_Product    *product = C->product;
1604 
1605   PetscFunctionBegin;
1606   if (product->type == MATPRODUCT_AB) {
1607     ierr = MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C);CHKERRQ(ierr);
1608   } else SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_SUP,"MatProduct type %s is not supported for SeqXBAIJ and SeqDense matrices",MatProductTypes[product->type]);
1609   PetscFunctionReturn(0);
1610 }
1611 /* ------------------------------------------------------- */
1612 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1613 {
1614   PetscFunctionBegin;
1615   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
1616   C->ops->productsymbolic = MatProductSymbolic_AB;
1617   C->ops->productnumeric  = MatProductNumeric_AB;
1618   PetscFunctionReturn(0);
1619 }
1620 
1621 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1622 {
1623   PetscErrorCode ierr;
1624   Mat_Product    *product = C->product;
1625 
1626   PetscFunctionBegin;
1627   if (product->type == MATPRODUCT_AB) {
1628     ierr = MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C);CHKERRQ(ierr);
1629   } else SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_SUP,"MatProduct type %s is not supported for SeqDense and SeqAIJ matrices",MatProductTypes[product->type]);
1630   PetscFunctionReturn(0);
1631 }
1632 /* ------------------------------------------------------- */
1633 
1634 PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1635 {
1636   PetscErrorCode ierr;
1637   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
1638   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
1639   PetscInt       *bi      = b->i,*bj=b->j;
1640   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1641   MatScalar      *btval,*btval_den,*ba=b->a;
1642   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1643 
1644   PetscFunctionBegin;
1645   btval_den=btdense->v;
1646   ierr     = PetscArrayzero(btval_den,m*n);CHKERRQ(ierr);
1647   for (k=0; k<ncolors; k++) {
1648     ncolumns = coloring->ncolumns[k];
1649     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1650       col   = *(columns + colorforcol[k] + l);
1651       btcol = bj + bi[col];
1652       btval = ba + bi[col];
1653       anz   = bi[col+1] - bi[col];
1654       for (j=0; j<anz; j++) {
1655         brow            = btcol[j];
1656         btval_den[brow] = btval[j];
1657       }
1658     }
1659     btval_den += m;
1660   }
1661   PetscFunctionReturn(0);
1662 }
1663 
1664 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1665 {
1666   PetscErrorCode    ierr;
1667   Mat_SeqAIJ        *csp = (Mat_SeqAIJ*)Csp->data;
1668   const PetscScalar *ca_den,*ca_den_ptr;
1669   PetscScalar       *ca=csp->a;
1670   PetscInt          k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1671   PetscInt          brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1672   PetscInt          nrows,*row,*idx;
1673   PetscInt          *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
1674 
1675   PetscFunctionBegin;
1676   ierr   = MatDenseGetArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1677 
1678   if (brows > 0) {
1679     PetscInt *lstart,row_end,row_start;
1680     lstart = matcoloring->lstart;
1681     ierr = PetscArrayzero(lstart,ncolors);CHKERRQ(ierr);
1682 
1683     row_end = brows;
1684     if (row_end > m) row_end = m;
1685     for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1686       ca_den_ptr = ca_den;
1687       for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1688         nrows = matcoloring->nrows[k];
1689         row   = rows  + colorforrow[k];
1690         idx   = den2sp + colorforrow[k];
1691         for (l=lstart[k]; l<nrows; l++) {
1692           if (row[l] >= row_end) {
1693             lstart[k] = l;
1694             break;
1695           } else {
1696             ca[idx[l]] = ca_den_ptr[row[l]];
1697           }
1698         }
1699         ca_den_ptr += m;
1700       }
1701       row_end += brows;
1702       if (row_end > m) row_end = m;
1703     }
1704   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1705     ca_den_ptr = ca_den;
1706     for (k=0; k<ncolors; k++) {
1707       nrows = matcoloring->nrows[k];
1708       row   = rows  + colorforrow[k];
1709       idx   = den2sp + colorforrow[k];
1710       for (l=0; l<nrows; l++) {
1711         ca[idx[l]] = ca_den_ptr[row[l]];
1712       }
1713       ca_den_ptr += m;
1714     }
1715   }
1716 
1717   ierr = MatDenseRestoreArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1718 #if defined(PETSC_USE_INFO)
1719   if (matcoloring->brows > 0) {
1720     ierr = PetscInfo1(Csp,"Loop over %D row blocks for den2sp\n",brows);CHKERRQ(ierr);
1721   } else {
1722     ierr = PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");CHKERRQ(ierr);
1723   }
1724 #endif
1725   PetscFunctionReturn(0);
1726 }
1727 
1728 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1729 {
1730   PetscErrorCode ierr;
1731   PetscInt       i,n,nrows,Nbs,j,k,m,ncols,col,cm;
1732   const PetscInt *is,*ci,*cj,*row_idx;
1733   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1734   IS             *isa;
1735   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1736   PetscInt       *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1737   PetscInt       *colorforcol,*columns,*columns_i,brows;
1738   PetscBool      flg;
1739 
1740   PetscFunctionBegin;
1741   ierr = ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa);CHKERRQ(ierr);
1742 
1743   /* bs >1 is not being tested yet! */
1744   Nbs       = mat->cmap->N/bs;
1745   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1746   c->N      = Nbs;
1747   c->m      = c->M;
1748   c->rstart = 0;
1749   c->brows  = 100;
1750 
1751   c->ncolors = nis;
1752   ierr = PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);CHKERRQ(ierr);
1753   ierr = PetscMalloc1(csp->nz+1,&rows);CHKERRQ(ierr);
1754   ierr = PetscMalloc1(csp->nz+1,&den2sp);CHKERRQ(ierr);
1755 
1756   brows = c->brows;
1757   ierr = PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);CHKERRQ(ierr);
1758   if (flg) c->brows = brows;
1759   if (brows > 0) {
1760     ierr = PetscMalloc1(nis+1,&c->lstart);CHKERRQ(ierr);
1761   }
1762 
1763   colorforrow[0] = 0;
1764   rows_i         = rows;
1765   den2sp_i       = den2sp;
1766 
1767   ierr = PetscMalloc1(nis+1,&colorforcol);CHKERRQ(ierr);
1768   ierr = PetscMalloc1(Nbs+1,&columns);CHKERRQ(ierr);
1769 
1770   colorforcol[0] = 0;
1771   columns_i      = columns;
1772 
1773   /* get column-wise storage of mat */
1774   ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1775 
1776   cm   = c->m;
1777   ierr = PetscMalloc1(cm+1,&rowhit);CHKERRQ(ierr);
1778   ierr = PetscMalloc1(cm+1,&idxhit);CHKERRQ(ierr);
1779   for (i=0; i<nis; i++) { /* loop over color */
1780     ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr);
1781     ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr);
1782 
1783     c->ncolumns[i] = n;
1784     if (n) {
1785       ierr = PetscArraycpy(columns_i,is,n);CHKERRQ(ierr);
1786     }
1787     colorforcol[i+1] = colorforcol[i] + n;
1788     columns_i       += n;
1789 
1790     /* fast, crude version requires O(N*N) work */
1791     ierr = PetscArrayzero(rowhit,cm);CHKERRQ(ierr);
1792 
1793     for (j=0; j<n; j++) { /* loop over columns*/
1794       col     = is[j];
1795       row_idx = cj + ci[col];
1796       m       = ci[col+1] - ci[col];
1797       for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1798         idxhit[*row_idx]   = spidx[ci[col] + k];
1799         rowhit[*row_idx++] = col + 1;
1800       }
1801     }
1802     /* count the number of hits */
1803     nrows = 0;
1804     for (j=0; j<cm; j++) {
1805       if (rowhit[j]) nrows++;
1806     }
1807     c->nrows[i]      = nrows;
1808     colorforrow[i+1] = colorforrow[i] + nrows;
1809 
1810     nrows = 0;
1811     for (j=0; j<cm; j++) { /* loop over rows */
1812       if (rowhit[j]) {
1813         rows_i[nrows]   = j;
1814         den2sp_i[nrows] = idxhit[j];
1815         nrows++;
1816       }
1817     }
1818     den2sp_i += nrows;
1819 
1820     ierr    = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr);
1821     rows_i += nrows;
1822   }
1823   ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1824   ierr = PetscFree(rowhit);CHKERRQ(ierr);
1825   ierr = ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa);CHKERRQ(ierr);
1826   if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1827 
1828   c->colorforrow = colorforrow;
1829   c->rows        = rows;
1830   c->den2sp      = den2sp;
1831   c->colorforcol = colorforcol;
1832   c->columns     = columns;
1833 
1834   ierr = PetscFree(idxhit);CHKERRQ(ierr);
1835   PetscFunctionReturn(0);
1836 }
1837 
1838 /* --------------------------------------------------------------- */
1839 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1840 {
1841   PetscErrorCode ierr;
1842   Mat_Product    *product = C->product;
1843   Mat            A=product->A,B=product->B;
1844 
1845   PetscFunctionBegin;
1846   if (C->ops->mattransposemultnumeric) {
1847     /* Alg: "outerproduct" */
1848     ierr = (C->ops->mattransposemultnumeric)(A,B,C);CHKERRQ(ierr);
1849   } else {
1850     /* Alg: "matmatmult" -- C = At*B */
1851     Mat_SeqAIJ          *c = (Mat_SeqAIJ*)C->data;
1852     Mat_MatTransMatMult *atb = c->atb;
1853     Mat                 At = atb->At;
1854 
1855     if (atb->updateAt) { /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ() */
1856       ierr = MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);CHKERRQ(ierr);
1857     }
1858     ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ(At,B,C);CHKERRQ(ierr);
1859     atb->updateAt = PETSC_TRUE;
1860   }
1861   PetscFunctionReturn(0);
1862 }
1863 
1864 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1865 {
1866   PetscErrorCode ierr;
1867   Mat_Product    *product = C->product;
1868   Mat            A=product->A,B=product->B;
1869   PetscReal      fill=product->fill;
1870 
1871   PetscFunctionBegin;
1872   ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
1873 
1874   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
1875   PetscFunctionReturn(0);
1876 }
1877 
1878 /* --------------------------------------------------------------- */
1879 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1880 {
1881   PetscErrorCode ierr;
1882   Mat_Product    *product = C->product;
1883   PetscInt       alg = 0; /* default algorithm */
1884   PetscBool      flg = PETSC_FALSE;
1885 #if !defined(PETSC_HAVE_HYPRE)
1886   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
1887   PetscInt       nalg = 7;
1888 #else
1889   const char     *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"};
1890   PetscInt       nalg = 8;
1891 #endif
1892 
1893   PetscFunctionBegin;
1894   /* Set default algorithm */
1895   ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr);
1896   if (flg) {
1897     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1898   }
1899 
1900   /* Get runtime option */
1901   if (product->api_user) {
1902     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
1903     ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
1904     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1905   } else {
1906     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
1907     ierr = PetscOptionsEList("-matproduct_ab_via","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
1908     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1909   }
1910   if (flg) {
1911     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1912   }
1913 
1914   C->ops->productsymbolic = MatProductSymbolic_AB;
1915   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
1916   PetscFunctionReturn(0);
1917 }
1918 
1919 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
1920 {
1921   PetscErrorCode ierr;
1922   Mat_Product    *product = C->product;
1923   PetscInt       alg = 0; /* default algorithm */
1924   PetscBool      flg = PETSC_FALSE;
1925   const char     *algTypes[2] = {"at*b","outerproduct"};
1926   PetscInt       nalg = 2;
1927 
1928   PetscFunctionBegin;
1929   /* Set default algorithm */
1930   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
1931   if (flg) {
1932     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1933   }
1934 
1935   /* Get runtime option */
1936   if (product->api_user) {
1937     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
1938     ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
1939     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1940   } else {
1941     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
1942     ierr = PetscOptionsEList("-matproduct_atb_via","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
1943     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1944   }
1945   if (flg) {
1946     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1947   }
1948 
1949   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
1950   PetscFunctionReturn(0);
1951 }
1952 
1953 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
1954 {
1955   PetscErrorCode ierr;
1956   Mat_Product    *product = C->product;
1957   PetscInt       alg = 0; /* default algorithm */
1958   PetscBool      flg = PETSC_FALSE;
1959   const char     *algTypes[2] = {"default","color"};
1960   PetscInt       nalg = 2;
1961 
1962   PetscFunctionBegin;
1963   /* Set default algorithm */
1964   ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr);
1965   if (!flg) {
1966     alg = 1;
1967     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1968   }
1969 
1970   /* Get runtime option */
1971   if (product->api_user) {
1972     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");CHKERRQ(ierr);
1973     ierr = PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
1974     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1975   } else {
1976     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");CHKERRQ(ierr);
1977     ierr = PetscOptionsEList("-matproduct_abt_via","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
1978     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1979   }
1980   if (flg) {
1981     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1982   }
1983 
1984   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
1985   C->ops->productsymbolic          = MatProductSymbolic_ABt;
1986   PetscFunctionReturn(0);
1987 }
1988 
1989 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
1990 {
1991   PetscErrorCode ierr;
1992   Mat_Product    *product = C->product;
1993   PetscBool      flg = PETSC_FALSE;
1994   PetscInt       alg = 0; /* default algorithm -- alg=1 should be default!!! */
1995 #if !defined(PETSC_HAVE_HYPRE)
1996   const char      *algTypes[2] = {"scalable","rap"};
1997   PetscInt        nalg = 2;
1998 #else
1999   const char      *algTypes[3] = {"scalable","rap","hypre"};
2000   PetscInt        nalg = 3;
2001 #endif
2002 
2003   PetscFunctionBegin;
2004   /* Set default algorithm */
2005   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
2006   if (flg) {
2007     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2008   }
2009 
2010   /* Get runtime option */
2011   if (product->api_user) {
2012     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
2013     ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
2014     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2015   } else {
2016     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
2017     ierr = PetscOptionsEList("-matproduct_ptap_via","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
2018     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2019   }
2020   if (flg) {
2021     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2022   }
2023 
2024   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
2025   PetscFunctionReturn(0);
2026 }
2027 
2028 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2029 {
2030   PetscErrorCode ierr;
2031   Mat_Product    *product = C->product;
2032   PetscBool      flg = PETSC_FALSE;
2033   PetscInt       alg = 0; /* default algorithm */
2034   const char     *algTypes[3] = {"r*a*rt","r*art","coloring_rart"};
2035   PetscInt        nalg = 3;
2036 
2037   PetscFunctionBegin;
2038   /* Set default algorithm */
2039   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
2040   if (flg) {
2041     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2042   }
2043 
2044   /* Get runtime option */
2045   if (product->api_user) {
2046     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
2047     ierr = PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
2048     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2049   } else {
2050     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
2051     ierr = PetscOptionsEList("-matproduct_rart_via","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
2052     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2053   }
2054   if (flg) {
2055     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2056   }
2057 
2058   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
2059   PetscFunctionReturn(0);
2060 }
2061 
2062 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
2063 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2064 {
2065   PetscErrorCode ierr;
2066   Mat_Product    *product = C->product;
2067   PetscInt       alg = 0; /* default algorithm */
2068   PetscBool      flg = PETSC_FALSE;
2069   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
2070   PetscInt       nalg = 7;
2071 
2072   PetscFunctionBegin;
2073   /* Set default algorithm */
2074   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
2075   if (flg) {
2076     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2077   }
2078 
2079   /* Get runtime option */
2080   if (product->api_user) {
2081     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
2082     ierr = PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
2083     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2084   } else {
2085     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
2086     ierr = PetscOptionsEList("-matproduct_abc_via","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
2087     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2088   }
2089   if (flg) {
2090     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2091   }
2092 
2093   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
2094   C->ops->productsymbolic    = MatProductSymbolic_ABC;
2095   PetscFunctionReturn(0);
2096 }
2097 
2098 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2099 {
2100   PetscErrorCode ierr;
2101   Mat_Product    *product = C->product;
2102 
2103   PetscFunctionBegin;
2104   switch (product->type) {
2105   case MATPRODUCT_AB:
2106     ierr = MatProductSetFromOptions_SeqAIJ_AB(C);CHKERRQ(ierr);
2107     break;
2108   case MATPRODUCT_AtB:
2109     ierr = MatProductSetFromOptions_SeqAIJ_AtB(C);CHKERRQ(ierr);
2110     break;
2111   case MATPRODUCT_ABt:
2112     ierr = MatProductSetFromOptions_SeqAIJ_ABt(C);CHKERRQ(ierr);
2113     break;
2114   case MATPRODUCT_PtAP:
2115     ierr = MatProductSetFromOptions_SeqAIJ_PtAP(C);CHKERRQ(ierr);
2116     break;
2117   case MATPRODUCT_RARt:
2118     ierr = MatProductSetFromOptions_SeqAIJ_RARt(C);CHKERRQ(ierr);
2119     break;
2120   case MATPRODUCT_ABC:
2121     ierr = MatProductSetFromOptions_SeqAIJ_ABC(C);CHKERRQ(ierr);
2122     break;
2123   default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported for SeqAIJ and SeqAIJ matrices",MatProductTypes[product->type]);
2124   }
2125   PetscFunctionReturn(0);
2126 }
2127