xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 6a98f8dc3f2c9149905a87dc2e9d0fedaf64e09a)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4           C = A * B
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <petscbt.h>
10 #include <petsc/private/isimpl.h>
11 #include <../src/mat/impls/dense/seq/dense.h>
12 
13 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
14 {
15   PetscErrorCode ierr;
16 
17   PetscFunctionBegin;
18   if (C->ops->matmultnumeric) {
19     ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr);
20   } else {
21     ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C);CHKERRQ(ierr);
22   }
23   PetscFunctionReturn(0);
24 }
25 
26 /* Modified from MatCreateSeqAIJWithArrays() */
27 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],Mat mat)
28 {
29   PetscErrorCode ierr;
30   PetscInt       ii;
31   Mat_SeqAIJ     *aij;
32 
33   PetscFunctionBegin;
34   if (m > 0 && i[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0");
35   ierr = MatSetSizes(mat,m,n,m,n);CHKERRQ(ierr);
36 
37   ierr = MatSetType(mat,MATSEQAIJ);CHKERRQ(ierr);
38   ierr = MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,0);CHKERRQ(ierr);
39   aij  = (Mat_SeqAIJ*)(mat)->data;
40   ierr = PetscMalloc1(m,&aij->imax);CHKERRQ(ierr);
41   ierr = PetscMalloc1(m,&aij->ilen);CHKERRQ(ierr);
42 
43   aij->i            = i;
44   aij->j            = j;
45   aij->a            = a;
46   aij->singlemalloc = PETSC_FALSE;
47   aij->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
48   aij->free_a       = PETSC_FALSE;
49   aij->free_ij      = PETSC_FALSE;
50 
51   for (ii=0; ii<m; ii++) {
52     aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii];
53   }
54 
55   ierr = MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
56   ierr = MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
57   PetscFunctionReturn(0);
58 }
59 
60 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
61 {
62   PetscErrorCode      ierr;
63   Mat_Product         *product = C->product;
64   MatProductAlgorithm alg;
65   PetscBool           flg;
66 
67   PetscFunctionBegin;
68   if (product) {
69     alg = product->alg;
70   } else {
71     alg = "sorted";
72   }
73 
74   /* sorted */
75   ierr = PetscStrcmp(alg,"sorted",&flg);CHKERRQ(ierr);
76   if (flg) {
77     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C);CHKERRQ(ierr);
78     PetscFunctionReturn(0);
79   }
80 
81   /* scalable */
82   ierr = PetscStrcmp(alg,"scalable",&flg);CHKERRQ(ierr);
83   if (flg) {
84     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr);
85     PetscFunctionReturn(0);
86   }
87 
88   /* scalable_fast */
89   ierr = PetscStrcmp(alg,"scalable_fast",&flg);CHKERRQ(ierr);
90   if (flg) {
91     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr);
92     PetscFunctionReturn(0);
93   }
94 
95   /* heap */
96   ierr = PetscStrcmp(alg,"heap",&flg);CHKERRQ(ierr);
97   if (flg) {
98     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr);
99     PetscFunctionReturn(0);
100   }
101 
102   /* btheap */
103   ierr = PetscStrcmp(alg,"btheap",&flg);CHKERRQ(ierr);
104   if (flg) {
105     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr);
106     PetscFunctionReturn(0);
107   }
108 
109   /* llcondensed */
110   ierr = PetscStrcmp(alg,"llcondensed",&flg);CHKERRQ(ierr);
111   if (flg) {
112     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr);
113     PetscFunctionReturn(0);
114   }
115 
116   /* rowmerge */
117   ierr = PetscStrcmp(alg,"rowmerge",&flg);CHKERRQ(ierr);
118   if (flg) {
119     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);CHKERRQ(ierr);
120     PetscFunctionReturn(0);
121   }
122 
123 #if defined(PETSC_HAVE_HYPRE)
124   ierr = PetscStrcmp(alg,"hypre",&flg);CHKERRQ(ierr);
125   if (flg) {
126     ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr);
127     PetscFunctionReturn(0);
128   }
129 #endif
130 
131   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
132   PetscFunctionReturn(0);
133 }
134 
135 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C)
136 {
137   PetscErrorCode     ierr;
138   Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
139   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
140   PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
141   PetscReal          afill;
142   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
143   PetscTable         ta;
144   PetscBT            lnkbt;
145   PetscFreeSpaceList free_space=NULL,current_space=NULL;
146 
147   PetscFunctionBegin;
148   /* Get ci and cj */
149   /*---------------*/
150   /* Allocate ci array, arrays for fill computation and */
151   /* free space for accumulating nonzero column info */
152   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
153   ci[0] = 0;
154 
155   /* create and initialize a linked list */
156   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
157   MatRowMergeMax_SeqAIJ(b,bm,ta);
158   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
159   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
160 
161   ierr = PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);CHKERRQ(ierr);
162 
163   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
164   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
165 
166   current_space = free_space;
167 
168   /* Determine ci and cj */
169   for (i=0; i<am; i++) {
170     anzi = ai[i+1] - ai[i];
171     aj   = a->j + ai[i];
172     for (j=0; j<anzi; j++) {
173       brow = aj[j];
174       bnzj = bi[brow+1] - bi[brow];
175       bj   = b->j + bi[brow];
176       /* add non-zero cols of B into the sorted linked list lnk */
177       ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr);
178     }
179     cnzi = lnk[0];
180 
181     /* If free space is not available, make more free space */
182     /* Double the amount of total space in the list */
183     if (current_space->local_remaining<cnzi) {
184       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
185       ndouble++;
186     }
187 
188     /* Copy data into free space, then initialize lnk */
189     ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
190 
191     current_space->array           += cnzi;
192     current_space->local_used      += cnzi;
193     current_space->local_remaining -= cnzi;
194 
195     ci[i+1] = ci[i] + cnzi;
196   }
197 
198   /* Column indices are in the list of free space */
199   /* Allocate space for cj, initialize cj, and */
200   /* destroy list of free space and other temporary array(s) */
201   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
202   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
203   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
204 
205   /* put together the new symbolic matrix */
206   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
207   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
208   ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
209 
210   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
211   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
212   c                         = (Mat_SeqAIJ*)(C->data);
213   c->free_a                 = PETSC_FALSE;
214   c->free_ij                = PETSC_TRUE;
215   c->nonew                  = 0;
216 
217   /* fast, needs non-scalable O(bn) array 'abdense' */
218   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
219 
220   /* set MatInfo */
221   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
222   if (afill < 1.0) afill = 1.0;
223   c->maxnz                     = ci[am];
224   c->nz                        = ci[am];
225   C->info.mallocs           = ndouble;
226   C->info.fill_ratio_given  = fill;
227   C->info.fill_ratio_needed = afill;
228 
229 #if defined(PETSC_USE_INFO)
230   if (ci[am]) {
231     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
232     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
233   } else {
234     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
235   }
236 #endif
237   PetscFunctionReturn(0);
238 }
239 
240 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C)
241 {
242   PetscErrorCode ierr;
243   PetscLogDouble flops=0.0;
244   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
245   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
246   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
247   PetscInt       *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
248   PetscInt       am   =A->rmap->n,cm=C->rmap->n;
249   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
250   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca,valtmp;
251   PetscScalar    *ab_dense;
252 
253   PetscFunctionBegin;
254   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
255     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
256     c->a      = ca;
257     c->free_a = PETSC_TRUE;
258   } else {
259     ca        = c->a;
260   }
261   if (!c->matmult_abdense) {
262     ierr = PetscCalloc1(B->cmap->N,&ab_dense);CHKERRQ(ierr);
263     c->matmult_abdense = ab_dense;
264   } else {
265     ab_dense = c->matmult_abdense;
266   }
267 
268   /* clean old values in C */
269   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
270   /* Traverse A row-wise. */
271   /* Build the ith row in C by summing over nonzero columns in A, */
272   /* the rows of B corresponding to nonzeros of A. */
273   for (i=0; i<am; i++) {
274     anzi = ai[i+1] - ai[i];
275     for (j=0; j<anzi; j++) {
276       brow = aj[j];
277       bnzi = bi[brow+1] - bi[brow];
278       bjj  = bj + bi[brow];
279       baj  = ba + bi[brow];
280       /* perform dense axpy */
281       valtmp = aa[j];
282       for (k=0; k<bnzi; k++) {
283         ab_dense[bjj[k]] += valtmp*baj[k];
284       }
285       flops += 2*bnzi;
286     }
287     aj += anzi; aa += anzi;
288 
289     cnzi = ci[i+1] - ci[i];
290     for (k=0; k<cnzi; k++) {
291       ca[k]          += ab_dense[cj[k]];
292       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
293     }
294     flops += cnzi;
295     cj    += cnzi; ca += cnzi;
296   }
297   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
298   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
299   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
300   PetscFunctionReturn(0);
301 }
302 
303 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
304 {
305   PetscErrorCode ierr;
306   PetscLogDouble flops=0.0;
307   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
308   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
309   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
310   PetscInt       *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
311   PetscInt       am   = A->rmap->N,cm=C->rmap->N;
312   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
313   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
314   PetscInt       nextb;
315 
316   PetscFunctionBegin;
317   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
318     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
319     c->a      = ca;
320     c->free_a = PETSC_TRUE;
321   }
322 
323   /* clean old values in C */
324   ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
325   /* Traverse A row-wise. */
326   /* Build the ith row in C by summing over nonzero columns in A, */
327   /* the rows of B corresponding to nonzeros of A. */
328   for (i=0; i<am; i++) {
329     anzi = ai[i+1] - ai[i];
330     cnzi = ci[i+1] - ci[i];
331     for (j=0; j<anzi; j++) {
332       brow = aj[j];
333       bnzi = bi[brow+1] - bi[brow];
334       bjj  = bj + bi[brow];
335       baj  = ba + bi[brow];
336       /* perform sparse axpy */
337       valtmp = aa[j];
338       nextb  = 0;
339       for (k=0; nextb<bnzi; k++) {
340         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
341           ca[k] += valtmp*baj[nextb++];
342         }
343       }
344       flops += 2*bnzi;
345     }
346     aj += anzi; aa += anzi;
347     cj += cnzi; ca += cnzi;
348   }
349 
350   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
351   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
352   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
353   PetscFunctionReturn(0);
354 }
355 
356 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C)
357 {
358   PetscErrorCode     ierr;
359   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
360   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
361   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
362   MatScalar          *ca;
363   PetscReal          afill;
364   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
365   PetscTable         ta;
366   PetscFreeSpaceList free_space=NULL,current_space=NULL;
367 
368   PetscFunctionBegin;
369   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
370   /*-----------------------------------------------------------------------------------------*/
371   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
372   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
373   ci[0] = 0;
374 
375   /* create and initialize a linked list */
376   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
377   MatRowMergeMax_SeqAIJ(b,bm,ta);
378   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
379   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
380 
381   ierr = PetscLLCondensedCreate_fast(Crmax,&lnk);CHKERRQ(ierr);
382 
383   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
384   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
385   current_space = free_space;
386 
387   /* Determine ci and cj */
388   for (i=0; i<am; i++) {
389     anzi = ai[i+1] - ai[i];
390     aj   = a->j + ai[i];
391     for (j=0; j<anzi; j++) {
392       brow = aj[j];
393       bnzj = bi[brow+1] - bi[brow];
394       bj   = b->j + bi[brow];
395       /* add non-zero cols of B into the sorted linked list lnk */
396       ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr);
397     }
398     cnzi = lnk[1];
399 
400     /* If free space is not available, make more free space */
401     /* Double the amount of total space in the list */
402     if (current_space->local_remaining<cnzi) {
403       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
404       ndouble++;
405     }
406 
407     /* Copy data into free space, then initialize lnk */
408     ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr);
409 
410     current_space->array           += cnzi;
411     current_space->local_used      += cnzi;
412     current_space->local_remaining -= cnzi;
413 
414     ci[i+1] = ci[i] + cnzi;
415   }
416 
417   /* Column indices are in the list of free space */
418   /* Allocate space for cj, initialize cj, and */
419   /* destroy list of free space and other temporary array(s) */
420   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
421   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
422   ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr);
423 
424   /* Allocate space for ca */
425   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
426 
427   /* put together the new symbolic matrix */
428   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);CHKERRQ(ierr);
429   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
430   ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
431 
432   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
433   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
434   c          = (Mat_SeqAIJ*)(C->data);
435   c->free_a  = PETSC_TRUE;
436   c->free_ij = PETSC_TRUE;
437   c->nonew   = 0;
438 
439   /* slower, less memory */
440   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
441 
442   /* set MatInfo */
443   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
444   if (afill < 1.0) afill = 1.0;
445   c->maxnz                     = ci[am];
446   c->nz                        = ci[am];
447   C->info.mallocs           = ndouble;
448   C->info.fill_ratio_given  = fill;
449   C->info.fill_ratio_needed = afill;
450 
451 #if defined(PETSC_USE_INFO)
452   if (ci[am]) {
453     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
454     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
455   } else {
456     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
457   }
458 #endif
459   PetscFunctionReturn(0);
460 }
461 
462 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C)
463 {
464   PetscErrorCode     ierr;
465   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
466   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
467   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
468   MatScalar          *ca;
469   PetscReal          afill;
470   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
471   PetscTable         ta;
472   PetscFreeSpaceList free_space=NULL,current_space=NULL;
473 
474   PetscFunctionBegin;
475   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
476   /*---------------------------------------------------------------------------------------------*/
477   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
478   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
479   ci[0] = 0;
480 
481   /* create and initialize a linked list */
482   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
483   MatRowMergeMax_SeqAIJ(b,bm,ta);
484   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
485   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
486   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
487 
488   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
489   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
490   current_space = free_space;
491 
492   /* Determine ci and cj */
493   for (i=0; i<am; i++) {
494     anzi = ai[i+1] - ai[i];
495     aj   = a->j + ai[i];
496     for (j=0; j<anzi; j++) {
497       brow = aj[j];
498       bnzj = bi[brow+1] - bi[brow];
499       bj   = b->j + bi[brow];
500       /* add non-zero cols of B into the sorted linked list lnk */
501       ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr);
502     }
503     cnzi = lnk[0];
504 
505     /* If free space is not available, make more free space */
506     /* Double the amount of total space in the list */
507     if (current_space->local_remaining<cnzi) {
508       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
509       ndouble++;
510     }
511 
512     /* Copy data into free space, then initialize lnk */
513     ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr);
514 
515     current_space->array           += cnzi;
516     current_space->local_used      += cnzi;
517     current_space->local_remaining -= cnzi;
518 
519     ci[i+1] = ci[i] + cnzi;
520   }
521 
522   /* Column indices are in the list of free space */
523   /* Allocate space for cj, initialize cj, and */
524   /* destroy list of free space and other temporary array(s) */
525   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
526   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
527   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
528 
529   /* Allocate space for ca */
530   /*-----------------------*/
531   ierr = PetscCalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
532 
533   /* put together the new symbolic matrix */
534   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);CHKERRQ(ierr);
535   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
536   ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
537 
538   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
539   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
540   c          = (Mat_SeqAIJ*)(C->data);
541   c->free_a  = PETSC_TRUE;
542   c->free_ij = PETSC_TRUE;
543   c->nonew   = 0;
544 
545   /* slower, less memory */
546   C->ops->matmultnumeric    = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
547 
548   /* set MatInfo */
549   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
550   if (afill < 1.0) afill = 1.0;
551   c->maxnz                     = ci[am];
552   c->nz                        = ci[am];
553   C->info.mallocs           = ndouble;
554   C->info.fill_ratio_given  = fill;
555   C->info.fill_ratio_needed = afill;
556 
557 #if defined(PETSC_USE_INFO)
558   if (ci[am]) {
559     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
560     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
561   } else {
562     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
563   }
564 #endif
565   PetscFunctionReturn(0);
566 }
567 
568 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C)
569 {
570   PetscErrorCode     ierr;
571   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
572   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
573   PetscInt           *ci,*cj,*bb;
574   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
575   PetscReal          afill;
576   PetscInt           i,j,col,ndouble = 0;
577   PetscFreeSpaceList free_space=NULL,current_space=NULL;
578   PetscHeap          h;
579 
580   PetscFunctionBegin;
581   /* Get ci and cj - by merging sorted rows using a heap */
582   /*---------------------------------------------------------------------------------------------*/
583   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
584   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
585   ci[0] = 0;
586 
587   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
588   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
589   current_space = free_space;
590 
591   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
592   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
593 
594   /* Determine ci and cj */
595   for (i=0; i<am; i++) {
596     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
597     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
598     ci[i+1] = ci[i];
599     /* Populate the min heap */
600     for (j=0; j<anzi; j++) {
601       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
602       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
603         ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr);
604       }
605     }
606     /* Pick off the min element, adding it to free space */
607     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
608     while (j >= 0) {
609       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
610         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
611         ndouble++;
612       }
613       *(current_space->array++) = col;
614       current_space->local_used++;
615       current_space->local_remaining--;
616       ci[i+1]++;
617 
618       /* stash if anything else remains in this row of B */
619       if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);}
620       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
621         PetscInt j2,col2;
622         ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr);
623         if (col2 != col) break;
624         ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr);
625         if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);}
626       }
627       /* Put any stashed elements back into the min heap */
628       ierr = PetscHeapUnstash(h);CHKERRQ(ierr);
629       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
630     }
631   }
632   ierr = PetscFree(bb);CHKERRQ(ierr);
633   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
634 
635   /* Column indices are in the list of free space */
636   /* Allocate space for cj, initialize cj, and */
637   /* destroy list of free space and other temporary array(s) */
638   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
639   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
640 
641   /* put together the new symbolic matrix */
642   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
643   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
644   ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
645 
646   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
647   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
648   c          = (Mat_SeqAIJ*)(C->data);
649   c->free_a  = PETSC_TRUE;
650   c->free_ij = PETSC_TRUE;
651   c->nonew   = 0;
652 
653   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
654 
655   /* set MatInfo */
656   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
657   if (afill < 1.0) afill = 1.0;
658   c->maxnz                     = ci[am];
659   c->nz                        = ci[am];
660   C->info.mallocs           = ndouble;
661   C->info.fill_ratio_given  = fill;
662   C->info.fill_ratio_needed = afill;
663 
664 #if defined(PETSC_USE_INFO)
665   if (ci[am]) {
666     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
667     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
668   } else {
669     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
670   }
671 #endif
672   PetscFunctionReturn(0);
673 }
674 
675 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C)
676 {
677   PetscErrorCode     ierr;
678   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
679   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
680   PetscInt           *ci,*cj,*bb;
681   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
682   PetscReal          afill;
683   PetscInt           i,j,col,ndouble = 0;
684   PetscFreeSpaceList free_space=NULL,current_space=NULL;
685   PetscHeap          h;
686   PetscBT            bt;
687 
688   PetscFunctionBegin;
689   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
690   /*---------------------------------------------------------------------------------------------*/
691   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
692   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
693   ci[0] = 0;
694 
695   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
696   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
697 
698   current_space = free_space;
699 
700   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
701   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
702   ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr);
703 
704   /* Determine ci and cj */
705   for (i=0; i<am; i++) {
706     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
707     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
708     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
709     ci[i+1] = ci[i];
710     /* Populate the min heap */
711     for (j=0; j<anzi; j++) {
712       PetscInt brow = acol[j];
713       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
714         PetscInt bcol = bj[bb[j]];
715         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
716           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
717           bb[j]++;
718           break;
719         }
720       }
721     }
722     /* Pick off the min element, adding it to free space */
723     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
724     while (j >= 0) {
725       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
726         fptr = NULL;                      /* need PetscBTMemzero */
727         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
728         ndouble++;
729       }
730       *(current_space->array++) = col;
731       current_space->local_used++;
732       current_space->local_remaining--;
733       ci[i+1]++;
734 
735       /* stash if anything else remains in this row of B */
736       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
737         PetscInt bcol = bj[bb[j]];
738         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
739           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
740           bb[j]++;
741           break;
742         }
743       }
744       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
745     }
746     if (fptr) {                 /* Clear the bits for this row */
747       for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);}
748     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
749       ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr);
750     }
751   }
752   ierr = PetscFree(bb);CHKERRQ(ierr);
753   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
754   ierr = PetscBTDestroy(&bt);CHKERRQ(ierr);
755 
756   /* Column indices are in the list of free space */
757   /* Allocate space for cj, initialize cj, and */
758   /* destroy list of free space and other temporary array(s) */
759   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
760   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
761 
762   /* put together the new symbolic matrix */
763   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
764   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
765   ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
766 
767   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
768   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
769   c          = (Mat_SeqAIJ*)(C->data);
770   c->free_a  = PETSC_TRUE;
771   c->free_ij = PETSC_TRUE;
772   c->nonew   = 0;
773 
774   C->ops->matmultnumeric        = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
775 
776   /* set MatInfo */
777   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
778   if (afill < 1.0) afill = 1.0;
779   c->maxnz                     = ci[am];
780   c->nz                        = ci[am];
781   C->info.mallocs           = ndouble;
782   C->info.fill_ratio_given  = fill;
783   C->info.fill_ratio_needed = afill;
784 
785 #if defined(PETSC_USE_INFO)
786   if (ci[am]) {
787     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
788     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
789   } else {
790     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
791   }
792 #endif
793   PetscFunctionReturn(0);
794 }
795 
796 
797 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C)
798 {
799   PetscErrorCode     ierr;
800   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
801   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1;
802   PetscInt           *ci,*cj,*outputj,worki_L1[9],worki_L2[9];
803   PetscInt           c_maxmem,a_maxrownnz=0,a_rownnz;
804   const PetscInt     workcol[8]={0,1,2,3,4,5,6,7};
805   const PetscInt     am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
806   const PetscInt     *brow_ptr[8],*brow_end[8];
807   PetscInt           window[8];
808   PetscInt           window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows;
809   PetscInt           i,k,ndouble=0,L1_rowsleft,rowsleft;
810   PetscReal          afill;
811   PetscInt           *workj_L1,*workj_L2,*workj_L3;
812   PetscInt           L1_nnz,L2_nnz;
813 
814   /* Step 1: Get upper bound on memory required for allocation.
815              Because of the way virtual memory works,
816              only the memory pages that are actually needed will be physically allocated. */
817   PetscFunctionBegin;
818   ierr = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
819   for (i=0; i<am; i++) {
820     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
821     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
822     a_rownnz = 0;
823     for (k=0; k<anzi; ++k) {
824       a_rownnz += bi[acol[k]+1] - bi[acol[k]];
825       if (a_rownnz > bn) {
826         a_rownnz = bn;
827         break;
828       }
829     }
830     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
831   }
832   /* temporary work areas for merging rows */
833   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L1);CHKERRQ(ierr);
834   ierr = PetscMalloc1(a_maxrownnz*8,&workj_L2);CHKERRQ(ierr);
835   ierr = PetscMalloc1(a_maxrownnz,&workj_L3);CHKERRQ(ierr);
836 
837   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
838   c_maxmem = 8*(ai[am]+bi[bm]);
839   /* Step 2: Populate pattern for C */
840   ierr  = PetscMalloc1(c_maxmem,&cj);CHKERRQ(ierr);
841 
842   ci_nnz       = 0;
843   ci[0]        = 0;
844   worki_L1[0]  = 0;
845   worki_L2[0]  = 0;
846   for (i=0; i<am; i++) {
847     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
848     const PetscInt *acol = aj + ai[i];      /* column indices of nonzero entries in this row */
849     rowsleft             = anzi;
850     inputcol_L1          = acol;
851     L2_nnz               = 0;
852     L2_nrows             = 1;  /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
853     worki_L2[1]          = 0;
854     outputi_nnz          = 0;
855 
856     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
857     while (ci_nnz+a_maxrownnz > c_maxmem) {
858       c_maxmem *= 2;
859       ndouble++;
860       ierr = PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);CHKERRQ(ierr);
861     }
862 
863     while (rowsleft) {
864       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
865       L1_nrows    = 0;
866       L1_nnz      = 0;
867       inputcol    = inputcol_L1;
868       inputi      = bi;
869       inputj      = bj;
870 
871       /* The following macro is used to specialize for small rows in A.
872          This helps with compiler unrolling, improving performance substantially.
873           Input:  inputj   inputi  inputcol  bn
874           Output: outputj  outputi_nnz                       */
875        #define MatMatMultSymbolic_RowMergeMacro(ANNZ)                        \
876          window_min  = bn;                                                   \
877          outputi_nnz = 0;                                                    \
878          for (k=0; k<ANNZ; ++k) {                                            \
879            brow_ptr[k] = inputj + inputi[inputcol[k]];                       \
880            brow_end[k] = inputj + inputi[inputcol[k]+1];                     \
881            window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn;   \
882            window_min  = PetscMin(window[k], window_min);                    \
883          }                                                                   \
884          while (window_min < bn) {                                           \
885            outputj[outputi_nnz++] = window_min;                              \
886            /* advance front and compute new minimum */                       \
887            old_window_min = window_min;                                      \
888            window_min = bn;                                                  \
889            for (k=0; k<ANNZ; ++k) {                                          \
890              if (window[k] == old_window_min) {                              \
891                brow_ptr[k]++;                                                \
892                window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
893              }                                                               \
894              window_min = PetscMin(window[k], window_min);                   \
895            }                                                                 \
896          }
897 
898       /************** L E V E L  1 ***************/
899       /* Merge up to 8 rows of B to L1 work array*/
900       while (L1_rowsleft) {
901         outputi_nnz = 0;
902         if (anzi > 8)  outputj = workj_L1 + L1_nnz;     /* Level 1 rowmerge*/
903         else           outputj = cj + ci_nnz;           /* Merge directly to C */
904 
905         switch (L1_rowsleft) {
906         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
907                  brow_end[0] = inputj + inputi[inputcol[0]+1];
908                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
909                  inputcol    += L1_rowsleft;
910                  rowsleft    -= L1_rowsleft;
911                  L1_rowsleft  = 0;
912                  break;
913         case 2:  MatMatMultSymbolic_RowMergeMacro(2);
914                  inputcol    += L1_rowsleft;
915                  rowsleft    -= L1_rowsleft;
916                  L1_rowsleft  = 0;
917                  break;
918         case 3: MatMatMultSymbolic_RowMergeMacro(3);
919                  inputcol    += L1_rowsleft;
920                  rowsleft    -= L1_rowsleft;
921                  L1_rowsleft  = 0;
922                  break;
923         case 4:  MatMatMultSymbolic_RowMergeMacro(4);
924                  inputcol    += L1_rowsleft;
925                  rowsleft    -= L1_rowsleft;
926                  L1_rowsleft  = 0;
927                  break;
928         case 5:  MatMatMultSymbolic_RowMergeMacro(5);
929                  inputcol    += L1_rowsleft;
930                  rowsleft    -= L1_rowsleft;
931                  L1_rowsleft  = 0;
932                  break;
933         case 6:  MatMatMultSymbolic_RowMergeMacro(6);
934                  inputcol    += L1_rowsleft;
935                  rowsleft    -= L1_rowsleft;
936                  L1_rowsleft  = 0;
937                  break;
938         case 7:  MatMatMultSymbolic_RowMergeMacro(7);
939                  inputcol    += L1_rowsleft;
940                  rowsleft    -= L1_rowsleft;
941                  L1_rowsleft  = 0;
942                  break;
943         default: MatMatMultSymbolic_RowMergeMacro(8);
944                  inputcol    += 8;
945                  rowsleft    -= 8;
946                  L1_rowsleft -= 8;
947                  break;
948         }
949         inputcol_L1           = inputcol;
950         L1_nnz               += outputi_nnz;
951         worki_L1[++L1_nrows]  = L1_nnz;
952       }
953 
954       /********************** L E V E L  2 ************************/
955       /* Merge from L1 work array to either C or to L2 work array */
956       if (anzi > 8) {
957         inputi      = worki_L1;
958         inputj      = workj_L1;
959         inputcol    = workcol;
960         outputi_nnz = 0;
961 
962         if (anzi <= 64) outputj = cj + ci_nnz;        /* Merge from L1 work array to C */
963         else            outputj = workj_L2 + L2_nnz;  /* Merge from L1 work array to L2 work array */
964 
965         switch (L1_nrows) {
966         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
967                  brow_end[0] = inputj + inputi[inputcol[0]+1];
968                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
969                  break;
970         case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
971         case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
972         case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
973         case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
974         case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
975         case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
976         case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
977         default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
978         }
979         L2_nnz               += outputi_nnz;
980         worki_L2[++L2_nrows]  = L2_nnz;
981 
982         /************************ L E V E L  3 **********************/
983         /* Merge from L2 work array to either C or to L2 work array */
984         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
985           inputi      = worki_L2;
986           inputj      = workj_L2;
987           inputcol    = workcol;
988           outputi_nnz = 0;
989           if (rowsleft) outputj = workj_L3;
990           else          outputj = cj + ci_nnz;
991           switch (L2_nrows) {
992           case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
993                    brow_end[0] = inputj + inputi[inputcol[0]+1];
994                    for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
995                    break;
996           case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
997           case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
998           case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
999           case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1000           case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1001           case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1002           case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1003           default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1004           }
1005           L2_nrows    = 1;
1006           L2_nnz      = outputi_nnz;
1007           worki_L2[1] = outputi_nnz;
1008           /* Copy to workj_L2 */
1009           if (rowsleft) {
1010             for (k=0; k<outputi_nnz; ++k)  workj_L2[k] = outputj[k];
1011           }
1012         }
1013       }
1014     }  /* while (rowsleft) */
1015 #undef MatMatMultSymbolic_RowMergeMacro
1016 
1017     /* terminate current row */
1018     ci_nnz += outputi_nnz;
1019     ci[i+1] = ci_nnz;
1020   }
1021 
1022   /* Step 3: Create the new symbolic matrix */
1023   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
1024   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
1025   ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
1026 
1027   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1028   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1029   c          = (Mat_SeqAIJ*)(C->data);
1030   c->free_a  = PETSC_TRUE;
1031   c->free_ij = PETSC_TRUE;
1032   c->nonew   = 0;
1033 
1034   C->ops->matmultnumeric        = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1035 
1036   /* set MatInfo */
1037   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1038   if (afill < 1.0) afill = 1.0;
1039   c->maxnz                     = ci[am];
1040   c->nz                        = ci[am];
1041   C->info.mallocs           = ndouble;
1042   C->info.fill_ratio_given  = fill;
1043   C->info.fill_ratio_needed = afill;
1044 
1045 #if defined(PETSC_USE_INFO)
1046   if (ci[am]) {
1047     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
1048     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1049   } else {
1050     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
1051   }
1052 #endif
1053 
1054   /* Step 4: Free temporary work areas */
1055   ierr = PetscFree(workj_L1);CHKERRQ(ierr);
1056   ierr = PetscFree(workj_L2);CHKERRQ(ierr);
1057   ierr = PetscFree(workj_L3);CHKERRQ(ierr);
1058   PetscFunctionReturn(0);
1059 }
1060 
1061 /* concatenate unique entries and then sort */
1062 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C)
1063 {
1064   PetscErrorCode     ierr;
1065   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1066   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
1067   PetscInt           *ci,*cj;
1068   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1069   PetscReal          afill;
1070   PetscInt           i,j,ndouble = 0;
1071   PetscSegBuffer     seg,segrow;
1072   char               *seen;
1073 
1074   PetscFunctionBegin;
1075   ierr  = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
1076   ci[0] = 0;
1077 
1078   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1079   ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr);
1080   ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr);
1081   ierr = PetscCalloc1(bn,&seen);CHKERRQ(ierr);
1082 
1083   /* Determine ci and cj */
1084   for (i=0; i<am; i++) {
1085     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1086     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1087     PetscInt packlen = 0,*PETSC_RESTRICT crow;
1088     /* Pack segrow */
1089     for (j=0; j<anzi; j++) {
1090       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
1091       for (k=bjstart; k<bjend; k++) {
1092         PetscInt bcol = bj[k];
1093         if (!seen[bcol]) { /* new entry */
1094           PetscInt *PETSC_RESTRICT slot;
1095           ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr);
1096           *slot = bcol;
1097           seen[bcol] = 1;
1098           packlen++;
1099         }
1100       }
1101     }
1102     ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr);
1103     ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr);
1104     ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr);
1105     ci[i+1] = ci[i] + packlen;
1106     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
1107   }
1108   ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr);
1109   ierr = PetscFree(seen);CHKERRQ(ierr);
1110 
1111   /* Column indices are in the segmented buffer */
1112   ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr);
1113   ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr);
1114 
1115   /* put together the new symbolic matrix */
1116   ierr = MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
1117   ierr = MatSetBlockSizesFromMats(C,A,B);CHKERRQ(ierr);
1118   ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
1119 
1120   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1121   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1122   c          = (Mat_SeqAIJ*)(C->data);
1123   c->free_a  = PETSC_TRUE;
1124   c->free_ij = PETSC_TRUE;
1125   c->nonew   = 0;
1126 
1127   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1128 
1129   /* set MatInfo */
1130   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1131   if (afill < 1.0) afill = 1.0;
1132   c->maxnz                     = ci[am];
1133   c->nz                        = ci[am];
1134   C->info.mallocs           = ndouble;
1135   C->info.fill_ratio_given  = fill;
1136   C->info.fill_ratio_needed = afill;
1137 
1138 #if defined(PETSC_USE_INFO)
1139   if (ci[am]) {
1140     ierr = PetscInfo3(C,"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
1141     ierr = PetscInfo1(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
1142   } else {
1143     ierr = PetscInfo(C,"Empty matrix product\n");CHKERRQ(ierr);
1144   }
1145 #endif
1146   PetscFunctionReturn(0);
1147 }
1148 
1149 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A)
1150 {
1151   PetscErrorCode      ierr;
1152   Mat_SeqAIJ          *a=(Mat_SeqAIJ*)A->data;
1153   Mat_MatMatTransMult *abt=a->abt;
1154 
1155   PetscFunctionBegin;
1156   ierr = (abt->destroy)(A);CHKERRQ(ierr);
1157   ierr = MatTransposeColoringDestroy(&abt->matcoloring);CHKERRQ(ierr);
1158   ierr = MatDestroy(&abt->Bt_den);CHKERRQ(ierr);
1159   ierr = MatDestroy(&abt->ABt_den);CHKERRQ(ierr);
1160   ierr = PetscFree(abt);CHKERRQ(ierr);
1161   PetscFunctionReturn(0);
1162 }
1163 
1164 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1165 {
1166   PetscErrorCode      ierr;
1167   Mat                 Bt;
1168   PetscInt            *bti,*btj;
1169   Mat_MatMatTransMult *abt;
1170   Mat_SeqAIJ          *c;
1171   Mat_Product         *product = C->product;
1172   MatProductAlgorithm alg = product->alg;
1173 
1174   PetscFunctionBegin;
1175   /* create symbolic Bt */
1176   ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
1177   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr);
1178   ierr = MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
1179   ierr = MatSetType(Bt,((PetscObject)A)->type_name);CHKERRQ(ierr);
1180 
1181   /* get symbolic C=A*Bt */
1182   ierr = MatProductSetAlgorithm(C,"sorted");CHKERRQ(ierr); /* set algorithm for C = A*Bt */
1183   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr);
1184   ierr = MatProductSetAlgorithm(C,alg);CHKERRQ(ierr); /* resume original algorithm for ABt product */
1185 
1186   /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
1187   ierr   = PetscNew(&abt);CHKERRQ(ierr);
1188   c      = (Mat_SeqAIJ*)C->data;
1189   c->abt = abt;
1190 
1191   abt->usecoloring = PETSC_FALSE;
1192   abt->destroy     = C->ops->destroy;
1193   C->ops->destroy  = MatDestroy_SeqAIJ_MatMatMultTrans;
1194   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
1195 
1196   abt->usecoloring = PETSC_FALSE;
1197   ierr = PetscStrcmp(product->alg,"color",&abt->usecoloring);CHKERRQ(ierr);
1198   if (abt->usecoloring) {
1199     /* Create MatTransposeColoring from symbolic C=A*B^T */
1200     MatTransposeColoring matcoloring;
1201     MatColoring          coloring;
1202     ISColoring           iscoloring;
1203     Mat                  Bt_dense,C_dense;
1204 
1205     /* inode causes memory problem */
1206     ierr = MatSetOption(C,MAT_USE_INODES,PETSC_FALSE);CHKERRQ(ierr);
1207 
1208     ierr = MatColoringCreate(C,&coloring);CHKERRQ(ierr);
1209     ierr = MatColoringSetDistance(coloring,2);CHKERRQ(ierr);
1210     ierr = MatColoringSetType(coloring,MATCOLORINGSL);CHKERRQ(ierr);
1211     ierr = MatColoringSetFromOptions(coloring);CHKERRQ(ierr);
1212     ierr = MatColoringApply(coloring,&iscoloring);CHKERRQ(ierr);
1213     ierr = MatColoringDestroy(&coloring);CHKERRQ(ierr);
1214     ierr = MatTransposeColoringCreate(C,iscoloring,&matcoloring);CHKERRQ(ierr);
1215 
1216     abt->matcoloring = matcoloring;
1217 
1218     ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr);
1219 
1220     /* Create Bt_dense and C_dense = A*Bt_dense */
1221     ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr);
1222     ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr);
1223     ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr);
1224     ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr);
1225 
1226     Bt_dense->assembled = PETSC_TRUE;
1227     abt->Bt_den         = Bt_dense;
1228 
1229     ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr);
1230     ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr);
1231     ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr);
1232     ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr);
1233 
1234     Bt_dense->assembled = PETSC_TRUE;
1235     abt->ABt_den  = C_dense;
1236 
1237 #if defined(PETSC_USE_INFO)
1238     {
1239       Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
1240       ierr = PetscInfo7(C,"Use coloring of C=A*B^T; B^T: %D %D, Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));CHKERRQ(ierr);
1241     }
1242 #endif
1243   }
1244   /* clean up */
1245   ierr = MatDestroy(&Bt);CHKERRQ(ierr);
1246   ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
1247   PetscFunctionReturn(0);
1248 }
1249 
1250 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1251 {
1252   PetscErrorCode      ierr;
1253   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1254   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
1255   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
1256   PetscLogDouble      flops=0.0;
1257   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
1258   Mat_MatMatTransMult *abt = c->abt;
1259 
1260   PetscFunctionBegin;
1261   /* clear old values in C */
1262   if (!c->a) {
1263     ierr      = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
1264     c->a      = ca;
1265     c->free_a = PETSC_TRUE;
1266   } else {
1267     ca =  c->a;
1268     ierr = PetscArrayzero(ca,ci[cm]+1);CHKERRQ(ierr);
1269   }
1270 
1271   if (abt->usecoloring) {
1272     MatTransposeColoring matcoloring = abt->matcoloring;
1273     Mat                  Bt_dense,C_dense = abt->ABt_den;
1274 
1275     /* Get Bt_dense by Apply MatTransposeColoring to B */
1276     Bt_dense = abt->Bt_den;
1277     ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr);
1278 
1279     /* C_dense = A*Bt_dense */
1280     ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr);
1281 
1282     /* Recover C from C_dense */
1283     ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr);
1284     PetscFunctionReturn(0);
1285   }
1286 
1287   for (i=0; i<cm; i++) {
1288     anzi = ai[i+1] - ai[i];
1289     acol = aj + ai[i];
1290     aval = aa + ai[i];
1291     cnzi = ci[i+1] - ci[i];
1292     ccol = cj + ci[i];
1293     cval = ca + ci[i];
1294     for (j=0; j<cnzi; j++) {
1295       brow = ccol[j];
1296       bnzj = bi[brow+1] - bi[brow];
1297       bcol = bj + bi[brow];
1298       bval = ba + bi[brow];
1299 
1300       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
1301       nexta = 0; nextb = 0;
1302       while (nexta<anzi && nextb<bnzj) {
1303         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
1304         if (nexta == anzi) break;
1305         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
1306         if (nextb == bnzj) break;
1307         if (acol[nexta] == bcol[nextb]) {
1308           cval[j] += aval[nexta]*bval[nextb];
1309           nexta++; nextb++;
1310           flops += 2;
1311         }
1312       }
1313     }
1314   }
1315   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1316   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1317   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1318   PetscFunctionReturn(0);
1319 }
1320 
1321 PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(Mat A)
1322 {
1323   PetscErrorCode      ierr;
1324   Mat_SeqAIJ          *a = (Mat_SeqAIJ*)A->data;
1325   Mat_MatTransMatMult *atb = a->atb;
1326 
1327   PetscFunctionBegin;
1328   if (atb) {
1329     ierr = MatDestroy(&atb->At);CHKERRQ(ierr);
1330     ierr = (*atb->destroy)(A);CHKERRQ(ierr);
1331   }
1332   ierr = PetscFree(atb);CHKERRQ(ierr);
1333   PetscFunctionReturn(0);
1334 }
1335 
1336 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1337 {
1338   PetscErrorCode      ierr;
1339   Mat                 At;
1340   PetscInt            *ati,*atj;
1341   Mat_Product         *product = C->product;
1342   MatProductAlgorithm alg;
1343   PetscBool           flg;
1344 
1345   PetscFunctionBegin;
1346   if (product) {
1347     alg = product->alg;
1348   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"!product, not supported yet");
1349 
1350   /* outerproduct */
1351   ierr = PetscStrcmp(alg,"outerproduct",&flg);CHKERRQ(ierr);
1352   if (flg) {
1353     /* create symbolic At */
1354     ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
1355     ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr);
1356     ierr = MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
1357     ierr = MatSetType(At,((PetscObject)A)->type_name);CHKERRQ(ierr);
1358 
1359     /* get symbolic C=At*B */
1360     product->alg = "sorted";
1361     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
1362 
1363     /* clean up */
1364     ierr = MatDestroy(&At);CHKERRQ(ierr);
1365     ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
1366 
1367     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
1368     PetscFunctionReturn(0);
1369   }
1370 
1371   /* matmatmult */
1372   ierr = PetscStrcmp(alg,"at*b",&flg);CHKERRQ(ierr);
1373   if (flg) {
1374     Mat_MatTransMatMult *atb;
1375     Mat_SeqAIJ          *c;
1376 
1377     ierr = PetscNew(&atb);CHKERRQ(ierr);
1378     ierr = MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);CHKERRQ(ierr);
1379     product->alg = "sorted";
1380     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
1381 
1382     c               = (Mat_SeqAIJ*)C->data;
1383     c->atb          = atb;
1384     atb->At         = At;
1385     atb->destroy    = C->ops->destroy;
1386     atb->updateAt   = PETSC_FALSE; /* because At is computed here */
1387     C->ops->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
1388 
1389     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
1390     PetscFunctionReturn(0);
1391   }
1392 
1393   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1394   PetscFunctionReturn(0);
1395 }
1396 
1397 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1398 {
1399   PetscErrorCode ierr;
1400   Mat_SeqAIJ     *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1401   PetscInt       am   =A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1402   PetscInt       cm   =C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1403   PetscLogDouble flops=0.0;
1404   MatScalar      *aa  =a->a,*ba,*ca,*caj;
1405 
1406   PetscFunctionBegin;
1407   if (!c->a) {
1408     ierr = PetscCalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
1409 
1410     c->a      = ca;
1411     c->free_a = PETSC_TRUE;
1412   } else {
1413     ca   = c->a;
1414     ierr = PetscArrayzero(ca,ci[cm]);CHKERRQ(ierr);
1415   }
1416 
1417   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1418   for (i=0; i<am; i++) {
1419     bj   = b->j + bi[i];
1420     ba   = b->a + bi[i];
1421     bnzi = bi[i+1] - bi[i];
1422     anzi = ai[i+1] - ai[i];
1423     for (j=0; j<anzi; j++) {
1424       nextb = 0;
1425       crow  = *aj++;
1426       cjj   = cj + ci[crow];
1427       caj   = ca + ci[crow];
1428       /* perform sparse axpy operation.  Note cjj includes bj. */
1429       for (k=0; nextb<bnzi; k++) {
1430         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
1431           caj[k] += (*aa)*(*(ba+nextb));
1432           nextb++;
1433         }
1434       }
1435       flops += 2*bnzi;
1436       aa++;
1437     }
1438   }
1439 
1440   /* Assemble the final matrix and clean up */
1441   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1442   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1443   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1444   PetscFunctionReturn(0);
1445 }
1446 
1447 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
1448 {
1449   PetscErrorCode ierr;
1450 
1451   PetscFunctionBegin;
1452   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
1453 
1454   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1455   PetscFunctionReturn(0);
1456 }
1457 
1458 PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1459 {
1460   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
1461   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
1462   PetscErrorCode    ierr;
1463   PetscScalar       *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4,aatmp;
1464   const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av;
1465   const PetscInt    *aj;
1466   PetscInt          cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n;
1467   PetscInt          am4=4*am,bm4=4*bm,col,i,j,n,ajtmp;
1468 
1469   PetscFunctionBegin;
1470   if (!cm || !cn) PetscFunctionReturn(0);
1471   ierr = MatSeqAIJGetArrayRead(A,&av);CHKERRQ(ierr);
1472   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1473   ierr = MatDenseGetArrayRead(B,&b);CHKERRQ(ierr);
1474   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1475   c1 = c; c2 = c1 + am; c3 = c2 + am; c4 = c3 + am;
1476   for (col=0; col<cn-4; col += 4) {  /* over columns of C */
1477     for (i=0; i<am; i++) {        /* over rows of C in those columns */
1478       r1 = r2 = r3 = r4 = 0.0;
1479       n  = a->i[i+1] - a->i[i];
1480       aj = a->j + a->i[i];
1481       aa = av + a->i[i];
1482       for (j=0; j<n; j++) {
1483         aatmp = aa[j]; ajtmp = aj[j];
1484         r1 += aatmp*b1[ajtmp];
1485         r2 += aatmp*b2[ajtmp];
1486         r3 += aatmp*b3[ajtmp];
1487         r4 += aatmp*b4[ajtmp];
1488       }
1489       c1[i] += r1;
1490       c2[i] += r2;
1491       c3[i] += r3;
1492       c4[i] += r4;
1493     }
1494     b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1495     c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1496   }
1497   for (; col<cn; col++) {   /* over extra columns of C */
1498     for (i=0; i<am; i++) {  /* over rows of C in those columns */
1499       r1 = 0.0;
1500       n  = a->i[i+1] - a->i[i];
1501       aj = a->j + a->i[i];
1502       aa = av + a->i[i];
1503       for (j=0; j<n; j++) {
1504         r1 += aa[j]*b1[aj[j]];
1505       }
1506       c1[i] += r1;
1507     }
1508     b1 += bm;
1509     c1 += am;
1510   }
1511   ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr);
1512   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1513   ierr = MatDenseRestoreArrayRead(B,&b);CHKERRQ(ierr);
1514   ierr = MatSeqAIJRestoreArrayRead(A,&av);CHKERRQ(ierr);
1515   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1516   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1517   PetscFunctionReturn(0);
1518 }
1519 
1520 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1521 {
1522   PetscErrorCode ierr;
1523 
1524   PetscFunctionBegin;
1525   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
1526   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1527   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1528 
1529   ierr = MatZeroEntries(C);CHKERRQ(ierr);
1530   ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C);CHKERRQ(ierr);
1531   PetscFunctionReturn(0);
1532 }
1533 
1534 /* ------------------------------------------------------- */
1535 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1536 {
1537   PetscFunctionBegin;
1538   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
1539   C->ops->productsymbolic = MatProductSymbolic_AB;
1540   /* dense mat may not call MatProductSymbolic(), thus set C->ops->productnumeric here */
1541   C->ops->productnumeric  = MatProductNumeric_AB;
1542   PetscFunctionReturn(0);
1543 }
1544 
1545 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1546 {
1547   PetscFunctionBegin;
1548   C->ops->transposematmultsymbolic = MatTransposeMatMultSymbolic_SeqAIJ_SeqDense;
1549   C->ops->productsymbolic          = MatProductSymbolic_AtB;
1550   C->ops->productnumeric           = MatProductNumeric_AtB;
1551   PetscFunctionReturn(0);
1552 }
1553 
1554 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1555 {
1556   PetscErrorCode ierr;
1557   Mat_Product    *product = C->product;
1558 
1559   PetscFunctionBegin;
1560   switch (product->type) {
1561   case MATPRODUCT_AB:
1562     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C);CHKERRQ(ierr);
1563     break;
1564   case MATPRODUCT_AtB:
1565     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C);CHKERRQ(ierr);
1566     break;
1567   case MATPRODUCT_PtAP:
1568     ierr = MatProductSetFromOptions_SeqDense(C);CHKERRQ(ierr);
1569     break;
1570   default:
1571     /* Use MatProduct_Basic() if there is no specific implementation */
1572     C->ops->productsymbolic = MatProductSymbolic_Basic;
1573   }
1574   PetscFunctionReturn(0);
1575 }
1576 /* ------------------------------------------------------- */
1577 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1578 {
1579   PetscErrorCode ierr;
1580   Mat_Product    *product = C->product;
1581   Mat            A = product->A;
1582   PetscBool      baij;
1583 
1584   PetscFunctionBegin;
1585   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij);CHKERRQ(ierr);
1586   if (!baij) { /* A is seqsbaij */
1587     PetscBool sbaij;
1588     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij);CHKERRQ(ierr);
1589     if (!sbaij) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format");
1590 
1591     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
1592   } else { /* A is seqbaij */
1593     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
1594   }
1595 
1596   C->ops->productsymbolic = MatProductSymbolic_AB;
1597   C->ops->productnumeric  = MatProductNumeric_AB;
1598   PetscFunctionReturn(0);
1599 }
1600 
1601 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1602 {
1603   PetscErrorCode ierr;
1604   Mat_Product    *product = C->product;
1605 
1606   PetscFunctionBegin;
1607   if (product->type == MATPRODUCT_AB) {
1608     ierr = MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C);CHKERRQ(ierr);
1609   } else SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_SUP,"MatProduct type is not supported");
1610   PetscFunctionReturn(0);
1611 }
1612 /* ------------------------------------------------------- */
1613 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1614 {
1615   PetscFunctionBegin;
1616   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
1617   C->ops->productsymbolic = MatProductSymbolic_AB;
1618   C->ops->productnumeric  = MatProductNumeric_AB;
1619   PetscFunctionReturn(0);
1620 }
1621 
1622 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1623 {
1624   PetscErrorCode ierr;
1625   Mat_Product    *product = C->product;
1626 
1627   PetscFunctionBegin;
1628   if (product->type == MATPRODUCT_AB) {
1629     ierr = MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C);CHKERRQ(ierr);
1630   } else SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_SUP,"MatProduct type is not supported");
1631   PetscFunctionReturn(0);
1632 }
1633 /* ------------------------------------------------------- */
1634 
1635 PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1636 {
1637   PetscErrorCode ierr;
1638   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
1639   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
1640   PetscInt       *bi      = b->i,*bj=b->j;
1641   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1642   MatScalar      *btval,*btval_den,*ba=b->a;
1643   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1644 
1645   PetscFunctionBegin;
1646   btval_den=btdense->v;
1647   ierr     = PetscArrayzero(btval_den,m*n);CHKERRQ(ierr);
1648   for (k=0; k<ncolors; k++) {
1649     ncolumns = coloring->ncolumns[k];
1650     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1651       col   = *(columns + colorforcol[k] + l);
1652       btcol = bj + bi[col];
1653       btval = ba + bi[col];
1654       anz   = bi[col+1] - bi[col];
1655       for (j=0; j<anz; j++) {
1656         brow            = btcol[j];
1657         btval_den[brow] = btval[j];
1658       }
1659     }
1660     btval_den += m;
1661   }
1662   PetscFunctionReturn(0);
1663 }
1664 
1665 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1666 {
1667   PetscErrorCode    ierr;
1668   Mat_SeqAIJ        *csp = (Mat_SeqAIJ*)Csp->data;
1669   const PetscScalar *ca_den,*ca_den_ptr;
1670   PetscScalar       *ca=csp->a;
1671   PetscInt          k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1672   PetscInt          brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1673   PetscInt          nrows,*row,*idx;
1674   PetscInt          *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
1675 
1676   PetscFunctionBegin;
1677   ierr   = MatDenseGetArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1678 
1679   if (brows > 0) {
1680     PetscInt *lstart,row_end,row_start;
1681     lstart = matcoloring->lstart;
1682     ierr = PetscArrayzero(lstart,ncolors);CHKERRQ(ierr);
1683 
1684     row_end = brows;
1685     if (row_end > m) row_end = m;
1686     for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1687       ca_den_ptr = ca_den;
1688       for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1689         nrows = matcoloring->nrows[k];
1690         row   = rows  + colorforrow[k];
1691         idx   = den2sp + colorforrow[k];
1692         for (l=lstart[k]; l<nrows; l++) {
1693           if (row[l] >= row_end) {
1694             lstart[k] = l;
1695             break;
1696           } else {
1697             ca[idx[l]] = ca_den_ptr[row[l]];
1698           }
1699         }
1700         ca_den_ptr += m;
1701       }
1702       row_end += brows;
1703       if (row_end > m) row_end = m;
1704     }
1705   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1706     ca_den_ptr = ca_den;
1707     for (k=0; k<ncolors; k++) {
1708       nrows = matcoloring->nrows[k];
1709       row   = rows  + colorforrow[k];
1710       idx   = den2sp + colorforrow[k];
1711       for (l=0; l<nrows; l++) {
1712         ca[idx[l]] = ca_den_ptr[row[l]];
1713       }
1714       ca_den_ptr += m;
1715     }
1716   }
1717 
1718   ierr = MatDenseRestoreArrayRead(Cden,&ca_den);CHKERRQ(ierr);
1719 #if defined(PETSC_USE_INFO)
1720   if (matcoloring->brows > 0) {
1721     ierr = PetscInfo1(Csp,"Loop over %D row blocks for den2sp\n",brows);CHKERRQ(ierr);
1722   } else {
1723     ierr = PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");CHKERRQ(ierr);
1724   }
1725 #endif
1726   PetscFunctionReturn(0);
1727 }
1728 
1729 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1730 {
1731   PetscErrorCode ierr;
1732   PetscInt       i,n,nrows,Nbs,j,k,m,ncols,col,cm;
1733   const PetscInt *is,*ci,*cj,*row_idx;
1734   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1735   IS             *isa;
1736   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1737   PetscInt       *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1738   PetscInt       *colorforcol,*columns,*columns_i,brows;
1739   PetscBool      flg;
1740 
1741   PetscFunctionBegin;
1742   ierr = ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa);CHKERRQ(ierr);
1743 
1744   /* bs >1 is not being tested yet! */
1745   Nbs       = mat->cmap->N/bs;
1746   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1747   c->N      = Nbs;
1748   c->m      = c->M;
1749   c->rstart = 0;
1750   c->brows  = 100;
1751 
1752   c->ncolors = nis;
1753   ierr = PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);CHKERRQ(ierr);
1754   ierr = PetscMalloc1(csp->nz+1,&rows);CHKERRQ(ierr);
1755   ierr = PetscMalloc1(csp->nz+1,&den2sp);CHKERRQ(ierr);
1756 
1757   brows = c->brows;
1758   ierr = PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);CHKERRQ(ierr);
1759   if (flg) c->brows = brows;
1760   if (brows > 0) {
1761     ierr = PetscMalloc1(nis+1,&c->lstart);CHKERRQ(ierr);
1762   }
1763 
1764   colorforrow[0] = 0;
1765   rows_i         = rows;
1766   den2sp_i       = den2sp;
1767 
1768   ierr = PetscMalloc1(nis+1,&colorforcol);CHKERRQ(ierr);
1769   ierr = PetscMalloc1(Nbs+1,&columns);CHKERRQ(ierr);
1770 
1771   colorforcol[0] = 0;
1772   columns_i      = columns;
1773 
1774   /* get column-wise storage of mat */
1775   ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1776 
1777   cm   = c->m;
1778   ierr = PetscMalloc1(cm+1,&rowhit);CHKERRQ(ierr);
1779   ierr = PetscMalloc1(cm+1,&idxhit);CHKERRQ(ierr);
1780   for (i=0; i<nis; i++) { /* loop over color */
1781     ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr);
1782     ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr);
1783 
1784     c->ncolumns[i] = n;
1785     if (n) {
1786       ierr = PetscArraycpy(columns_i,is,n);CHKERRQ(ierr);
1787     }
1788     colorforcol[i+1] = colorforcol[i] + n;
1789     columns_i       += n;
1790 
1791     /* fast, crude version requires O(N*N) work */
1792     ierr = PetscArrayzero(rowhit,cm);CHKERRQ(ierr);
1793 
1794     for (j=0; j<n; j++) { /* loop over columns*/
1795       col     = is[j];
1796       row_idx = cj + ci[col];
1797       m       = ci[col+1] - ci[col];
1798       for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1799         idxhit[*row_idx]   = spidx[ci[col] + k];
1800         rowhit[*row_idx++] = col + 1;
1801       }
1802     }
1803     /* count the number of hits */
1804     nrows = 0;
1805     for (j=0; j<cm; j++) {
1806       if (rowhit[j]) nrows++;
1807     }
1808     c->nrows[i]      = nrows;
1809     colorforrow[i+1] = colorforrow[i] + nrows;
1810 
1811     nrows = 0;
1812     for (j=0; j<cm; j++) { /* loop over rows */
1813       if (rowhit[j]) {
1814         rows_i[nrows]   = j;
1815         den2sp_i[nrows] = idxhit[j];
1816         nrows++;
1817       }
1818     }
1819     den2sp_i += nrows;
1820 
1821     ierr    = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr);
1822     rows_i += nrows;
1823   }
1824   ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1825   ierr = PetscFree(rowhit);CHKERRQ(ierr);
1826   ierr = ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa);CHKERRQ(ierr);
1827   if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1828 
1829   c->colorforrow = colorforrow;
1830   c->rows        = rows;
1831   c->den2sp      = den2sp;
1832   c->colorforcol = colorforcol;
1833   c->columns     = columns;
1834 
1835   ierr = PetscFree(idxhit);CHKERRQ(ierr);
1836   PetscFunctionReturn(0);
1837 }
1838 
1839 /* --------------------------------------------------------------- */
1840 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1841 {
1842   PetscErrorCode ierr;
1843   Mat_Product    *product = C->product;
1844   Mat            A=product->A,B=product->B;
1845 
1846   PetscFunctionBegin;
1847   if (C->ops->mattransposemultnumeric) {
1848     /* Alg: "outerproduct" */
1849     ierr = (C->ops->mattransposemultnumeric)(A,B,C);CHKERRQ(ierr);
1850   } else {
1851     /* Alg: "matmatmult" -- C = At*B */
1852     Mat_SeqAIJ          *c = (Mat_SeqAIJ*)C->data;
1853     Mat_MatTransMatMult *atb = c->atb;
1854     Mat                 At = atb->At;
1855 
1856     if (atb->updateAt) { /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ() */
1857       ierr = MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);CHKERRQ(ierr);
1858     }
1859     ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ(At,B,C);CHKERRQ(ierr);
1860     atb->updateAt = PETSC_TRUE;
1861   }
1862   PetscFunctionReturn(0);
1863 }
1864 
1865 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1866 {
1867   PetscErrorCode ierr;
1868   Mat_Product    *product = C->product;
1869   Mat            A=product->A,B=product->B;
1870   PetscReal      fill=product->fill;
1871 
1872   PetscFunctionBegin;
1873   ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
1874 
1875   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
1876   PetscFunctionReturn(0);
1877 }
1878 
1879 /* --------------------------------------------------------------- */
1880 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1881 {
1882   PetscErrorCode ierr;
1883   Mat_Product    *product = C->product;
1884   PetscInt       alg = 0; /* default algorithm */
1885   PetscBool      flg = PETSC_FALSE;
1886 #if !defined(PETSC_HAVE_HYPRE)
1887   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
1888   PetscInt       nalg = 7;
1889 #else
1890   const char     *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"};
1891   PetscInt       nalg = 8;
1892 #endif
1893 
1894   PetscFunctionBegin;
1895   /* Set default algorithm */
1896   ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr);
1897   if (flg) {
1898     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1899   }
1900 
1901   /* Get runtime option */
1902   if (product->api_user) {
1903     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
1904     ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
1905     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1906   } else {
1907     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
1908     ierr = PetscOptionsEList("-matproduct_ab_via","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
1909     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1910   }
1911   if (flg) {
1912     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1913   }
1914 
1915   C->ops->productsymbolic = MatProductSymbolic_AB;
1916   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
1917   PetscFunctionReturn(0);
1918 }
1919 
1920 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
1921 {
1922   PetscErrorCode ierr;
1923   Mat_Product    *product = C->product;
1924   PetscInt       alg = 0; /* default algorithm */
1925   PetscBool      flg = PETSC_FALSE;
1926   const char     *algTypes[2] = {"at*b","outerproduct"};
1927   PetscInt       nalg = 2;
1928 
1929   PetscFunctionBegin;
1930   /* Set default algorithm */
1931   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
1932   if (flg) {
1933     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1934   }
1935 
1936   /* Get runtime option */
1937   if (product->api_user) {
1938     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
1939     ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
1940     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1941   } else {
1942     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
1943     ierr = PetscOptionsEList("-matproduct_atb_via","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
1944     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1945   }
1946   if (flg) {
1947     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1948   }
1949 
1950   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
1951   PetscFunctionReturn(0);
1952 }
1953 
1954 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
1955 {
1956   PetscErrorCode ierr;
1957   Mat_Product    *product = C->product;
1958   PetscInt       alg = 0; /* default algorithm */
1959   PetscBool      flg = PETSC_FALSE;
1960   const char     *algTypes[2] = {"default","color"};
1961   PetscInt       nalg = 2;
1962 
1963   PetscFunctionBegin;
1964   /* Set default algorithm */
1965   ierr = PetscStrcmp(C->product->alg,"default",&flg);CHKERRQ(ierr);
1966   if (!flg) {
1967     alg = 1;
1968     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1969   }
1970 
1971   /* Get runtime option */
1972   if (product->api_user) {
1973     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");CHKERRQ(ierr);
1974     ierr = PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
1975     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1976   } else {
1977     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");CHKERRQ(ierr);
1978     ierr = PetscOptionsEList("-matproduct_abt_via","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
1979     ierr = PetscOptionsEnd();CHKERRQ(ierr);
1980   }
1981   if (flg) {
1982     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
1983   }
1984 
1985   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
1986   C->ops->productsymbolic          = MatProductSymbolic_ABt;
1987   PetscFunctionReturn(0);
1988 }
1989 
1990 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
1991 {
1992   PetscErrorCode ierr;
1993   Mat_Product    *product = C->product;
1994   PetscBool      flg = PETSC_FALSE;
1995   PetscInt       alg = 0; /* default algorithm -- alg=1 should be default!!! */
1996 #if !defined(PETSC_HAVE_HYPRE)
1997   const char      *algTypes[2] = {"scalable","rap"};
1998   PetscInt        nalg = 2;
1999 #else
2000   const char      *algTypes[3] = {"scalable","rap","hypre"};
2001   PetscInt        nalg = 3;
2002 #endif
2003 
2004   PetscFunctionBegin;
2005   /* Set default algorithm */
2006   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
2007   if (flg) {
2008     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2009   }
2010 
2011   /* Get runtime option */
2012   if (product->api_user) {
2013     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
2014     ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
2015     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2016   } else {
2017     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
2018     ierr = PetscOptionsEList("-matproduct_ptap_via","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
2019     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2020   }
2021   if (flg) {
2022     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2023   }
2024 
2025   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
2026   PetscFunctionReturn(0);
2027 }
2028 
2029 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2030 {
2031   PetscErrorCode ierr;
2032   Mat_Product    *product = C->product;
2033   PetscBool      flg = PETSC_FALSE;
2034   PetscInt       alg = 0; /* default algorithm */
2035   const char     *algTypes[3] = {"r*a*rt","r*art","coloring_rart"};
2036   PetscInt        nalg = 3;
2037 
2038   PetscFunctionBegin;
2039   /* Set default algorithm */
2040   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
2041   if (flg) {
2042     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2043   }
2044 
2045   /* Get runtime option */
2046   if (product->api_user) {
2047     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
2048     ierr = PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
2049     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2050   } else {
2051     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
2052     ierr = PetscOptionsEList("-matproduct_rart_via","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg);CHKERRQ(ierr);
2053     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2054   }
2055   if (flg) {
2056     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2057   }
2058 
2059   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
2060   PetscFunctionReturn(0);
2061 }
2062 
2063 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
2064 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2065 {
2066   PetscErrorCode ierr;
2067   Mat_Product    *product = C->product;
2068   PetscInt       alg = 0; /* default algorithm */
2069   PetscBool      flg = PETSC_FALSE;
2070   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
2071   PetscInt       nalg = 7;
2072 
2073   PetscFunctionBegin;
2074   /* Set default algorithm */
2075   ierr = PetscStrcmp(product->alg,"default",&flg);CHKERRQ(ierr);
2076   if (flg) {
2077     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2078   }
2079 
2080   /* Get runtime option */
2081   if (product->api_user) {
2082     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
2083     ierr = PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
2084     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2085   } else {
2086     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
2087     ierr = PetscOptionsEList("-matproduct_abc_via","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
2088     ierr = PetscOptionsEnd();CHKERRQ(ierr);
2089   }
2090   if (flg) {
2091     ierr = MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);CHKERRQ(ierr);
2092   }
2093 
2094   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
2095   C->ops->productsymbolic    = MatProductSymbolic_ABC;
2096   PetscFunctionReturn(0);
2097 }
2098 
2099 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2100 {
2101   PetscErrorCode ierr;
2102   Mat_Product    *product = C->product;
2103 
2104   PetscFunctionBegin;
2105   switch (product->type) {
2106   case MATPRODUCT_AB:
2107     ierr = MatProductSetFromOptions_SeqAIJ_AB(C);CHKERRQ(ierr);
2108     break;
2109   case MATPRODUCT_AtB:
2110     ierr = MatProductSetFromOptions_SeqAIJ_AtB(C);CHKERRQ(ierr);
2111     break;
2112   case MATPRODUCT_ABt:
2113     ierr = MatProductSetFromOptions_SeqAIJ_ABt(C);CHKERRQ(ierr);
2114     break;
2115   case MATPRODUCT_PtAP:
2116     ierr = MatProductSetFromOptions_SeqAIJ_PtAP(C);CHKERRQ(ierr);
2117     break;
2118   case MATPRODUCT_RARt:
2119     ierr = MatProductSetFromOptions_SeqAIJ_RARt(C);CHKERRQ(ierr);
2120     break;
2121   case MATPRODUCT_ABC:
2122     ierr = MatProductSetFromOptions_SeqAIJ_ABC(C);CHKERRQ(ierr);
2123     break;
2124   default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type is not supported");
2125   }
2126   PetscFunctionReturn(0);
2127 }
2128