xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 2bf68e3e0f2a61f71e7c65bee250bfa1c8ce0cdb)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4           C = A * B
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <../src/mat/utils/petscheap.h>
10 #include <petscbt.h>
11 #include <petsc/private/isimpl.h>
12 #include <../src/mat/impls/dense/seq/dense.h>
13 
14 static PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat,Mat,PetscReal,Mat*);
15 
16 #if defined(PETSC_HAVE_HYPRE)
17 PETSC_INTERN PetscErrorCode MatMatMultSymbolic_AIJ_AIJ_wHYPRE(Mat,Mat,PetscReal,Mat*);
18 #endif
19 
20 #undef __FUNCT__
21 #define __FUNCT__ "MatMatMult_SeqAIJ_SeqAIJ"
22 PETSC_INTERN PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
23 {
24   PetscErrorCode ierr;
25 #if !defined(PETSC_HAVE_HYPRE)
26   const char     *algTypes[6] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed"};
27   PetscInt       nalg = 6;
28 #else
29   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","hypre"};
30   PetscInt       nalg = 7;
31 #endif
32   PetscInt       alg = 0; /* set default algorithm */
33 
34   PetscFunctionBegin;
35   if (scall == MAT_INITIAL_MATRIX) {
36     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
37     ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,NULL);CHKERRQ(ierr);
38     ierr = PetscOptionsEnd();CHKERRQ(ierr);
39     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
40     switch (alg) {
41     case 1:
42       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr);
43       break;
44     case 2:
45       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr);
46       break;
47     case 3:
48       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr);
49       break;
50     case 4:
51       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr);
52       break;
53     case 5:
54       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr);
55       break;
56 #if defined(PETSC_HAVE_HYPRE)
57     case 6:
58       ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr);
59       break;
60 #endif
61     default:
62       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
63      break;
64     }
65     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
66   }
67 
68   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
69   ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
70   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
71   PetscFunctionReturn(0);
72 }
73 
74 #undef __FUNCT__
75 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed"
76 static PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat *C)
77 {
78   PetscErrorCode     ierr;
79   Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
80   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
81   PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
82   PetscReal          afill;
83   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
84   PetscTable         ta;
85   PetscBT            lnkbt;
86   PetscFreeSpaceList free_space=NULL,current_space=NULL;
87 
88   PetscFunctionBegin;
89   /* Get ci and cj */
90   /*---------------*/
91   /* Allocate ci array, arrays for fill computation and */
92   /* free space for accumulating nonzero column info */
93   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
94   ci[0] = 0;
95 
96   /* create and initialize a linked list */
97   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
98   MatRowMergeMax_SeqAIJ(b,bm,ta);
99   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
100   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
101 
102   ierr = PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);CHKERRQ(ierr);
103 
104   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
105   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
106 
107   current_space = free_space;
108 
109   /* Determine ci and cj */
110   for (i=0; i<am; i++) {
111     anzi = ai[i+1] - ai[i];
112     aj   = a->j + ai[i];
113     for (j=0; j<anzi; j++) {
114       brow = aj[j];
115       bnzj = bi[brow+1] - bi[brow];
116       bj   = b->j + bi[brow];
117       /* add non-zero cols of B into the sorted linked list lnk */
118       ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr);
119     }
120     cnzi = lnk[0];
121 
122     /* If free space is not available, make more free space */
123     /* Double the amount of total space in the list */
124     if (current_space->local_remaining<cnzi) {
125       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
126       ndouble++;
127     }
128 
129     /* Copy data into free space, then initialize lnk */
130     ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
131 
132     current_space->array           += cnzi;
133     current_space->local_used      += cnzi;
134     current_space->local_remaining -= cnzi;
135 
136     ci[i+1] = ci[i] + cnzi;
137   }
138 
139   /* Column indices are in the list of free space */
140   /* Allocate space for cj, initialize cj, and */
141   /* destroy list of free space and other temporary array(s) */
142   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
143   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
144   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
145 
146   /* put together the new symbolic matrix */
147   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
148   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
149 
150   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
151   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
152   c                         = (Mat_SeqAIJ*)((*C)->data);
153   c->free_a                 = PETSC_FALSE;
154   c->free_ij                = PETSC_TRUE;
155   c->nonew                  = 0;
156   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; /* fast, needs non-scalable O(bn) array 'abdense' */
157 
158   /* set MatInfo */
159   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
160   if (afill < 1.0) afill = 1.0;
161   c->maxnz                     = ci[am];
162   c->nz                        = ci[am];
163   (*C)->info.mallocs           = ndouble;
164   (*C)->info.fill_ratio_given  = fill;
165   (*C)->info.fill_ratio_needed = afill;
166 
167 #if defined(PETSC_USE_INFO)
168   if (ci[am]) {
169     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
170     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
171   } else {
172     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
173   }
174 #endif
175   PetscFunctionReturn(0);
176 }
177 
178 #undef __FUNCT__
179 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqAIJ"
180 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
181 {
182   PetscErrorCode ierr;
183   PetscLogDouble flops=0.0;
184   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
185   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
186   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
187   PetscInt       *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
188   PetscInt       am   =A->rmap->n,cm=C->rmap->n;
189   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
190   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca,valtmp;
191   PetscScalar    *ab_dense;
192 
193   PetscFunctionBegin;
194   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
195     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
196     c->a      = ca;
197     c->free_a = PETSC_TRUE;
198   } else {
199     ca        = c->a;
200   }
201   if (!c->matmult_abdense) {
202     ierr = PetscCalloc1(B->cmap->N,&ab_dense);CHKERRQ(ierr);
203     c->matmult_abdense = ab_dense;
204   } else {
205     ab_dense = c->matmult_abdense;
206   }
207 
208   /* clean old values in C */
209   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
210   /* Traverse A row-wise. */
211   /* Build the ith row in C by summing over nonzero columns in A, */
212   /* the rows of B corresponding to nonzeros of A. */
213   for (i=0; i<am; i++) {
214     anzi = ai[i+1] - ai[i];
215     for (j=0; j<anzi; j++) {
216       brow = aj[j];
217       bnzi = bi[brow+1] - bi[brow];
218       bjj  = bj + bi[brow];
219       baj  = ba + bi[brow];
220       /* perform dense axpy */
221       valtmp = aa[j];
222       for (k=0; k<bnzi; k++) {
223         ab_dense[bjj[k]] += valtmp*baj[k];
224       }
225       flops += 2*bnzi;
226     }
227     aj += anzi; aa += anzi;
228 
229     cnzi = ci[i+1] - ci[i];
230     for (k=0; k<cnzi; k++) {
231       ca[k]          += ab_dense[cj[k]];
232       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
233     }
234     flops += cnzi;
235     cj    += cnzi; ca += cnzi;
236   }
237   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
238   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
239   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
240   PetscFunctionReturn(0);
241 }
242 
243 #undef __FUNCT__
244 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable"
245 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
246 {
247   PetscErrorCode ierr;
248   PetscLogDouble flops=0.0;
249   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
250   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
251   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
252   PetscInt       *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
253   PetscInt       am   = A->rmap->N,cm=C->rmap->N;
254   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
255   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
256   PetscInt       nextb;
257 
258   PetscFunctionBegin;
259   /* clean old values in C */
260   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
261   /* Traverse A row-wise. */
262   /* Build the ith row in C by summing over nonzero columns in A, */
263   /* the rows of B corresponding to nonzeros of A. */
264   for (i=0; i<am; i++) {
265     anzi = ai[i+1] - ai[i];
266     cnzi = ci[i+1] - ci[i];
267     for (j=0; j<anzi; j++) {
268       brow = aj[j];
269       bnzi = bi[brow+1] - bi[brow];
270       bjj  = bj + bi[brow];
271       baj  = ba + bi[brow];
272       /* perform sparse axpy */
273       valtmp = aa[j];
274       nextb  = 0;
275       for (k=0; nextb<bnzi; k++) {
276         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
277           ca[k] += valtmp*baj[nextb++];
278         }
279       }
280       flops += 2*bnzi;
281     }
282     aj += anzi; aa += anzi;
283     cj += cnzi; ca += cnzi;
284   }
285 
286   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
287   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
288   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
289   PetscFunctionReturn(0);
290 }
291 
292 #undef __FUNCT__
293 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast"
294 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat *C)
295 {
296   PetscErrorCode     ierr;
297   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
298   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
299   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
300   MatScalar          *ca;
301   PetscReal          afill;
302   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
303   PetscTable         ta;
304   PetscFreeSpaceList free_space=NULL,current_space=NULL;
305 
306   PetscFunctionBegin;
307   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
308   /*-----------------------------------------------------------------------------------------*/
309   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
310   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
311   ci[0] = 0;
312 
313   /* create and initialize a linked list */
314   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
315   MatRowMergeMax_SeqAIJ(b,bm,ta);
316   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
317   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
318 
319   ierr = PetscLLCondensedCreate_fast(Crmax,&lnk);CHKERRQ(ierr);
320 
321   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
322   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
323   current_space = free_space;
324 
325   /* Determine ci and cj */
326   for (i=0; i<am; i++) {
327     anzi = ai[i+1] - ai[i];
328     aj   = a->j + ai[i];
329     for (j=0; j<anzi; j++) {
330       brow = aj[j];
331       bnzj = bi[brow+1] - bi[brow];
332       bj   = b->j + bi[brow];
333       /* add non-zero cols of B into the sorted linked list lnk */
334       ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr);
335     }
336     cnzi = lnk[1];
337 
338     /* If free space is not available, make more free space */
339     /* Double the amount of total space in the list */
340     if (current_space->local_remaining<cnzi) {
341       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
342       ndouble++;
343     }
344 
345     /* Copy data into free space, then initialize lnk */
346     ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr);
347 
348     current_space->array           += cnzi;
349     current_space->local_used      += cnzi;
350     current_space->local_remaining -= cnzi;
351 
352     ci[i+1] = ci[i] + cnzi;
353   }
354 
355   /* Column indices are in the list of free space */
356   /* Allocate space for cj, initialize cj, and */
357   /* destroy list of free space and other temporary array(s) */
358   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
359   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
360   ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr);
361 
362   /* Allocate space for ca */
363   ierr = PetscMalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
364   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
365 
366   /* put together the new symbolic matrix */
367   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);CHKERRQ(ierr);
368   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
369 
370   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
371   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
372   c          = (Mat_SeqAIJ*)((*C)->data);
373   c->free_a  = PETSC_TRUE;
374   c->free_ij = PETSC_TRUE;
375   c->nonew   = 0;
376 
377   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
378 
379   /* set MatInfo */
380   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
381   if (afill < 1.0) afill = 1.0;
382   c->maxnz                     = ci[am];
383   c->nz                        = ci[am];
384   (*C)->info.mallocs           = ndouble;
385   (*C)->info.fill_ratio_given  = fill;
386   (*C)->info.fill_ratio_needed = afill;
387 
388 #if defined(PETSC_USE_INFO)
389   if (ci[am]) {
390     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
391     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
392   } else {
393     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
394   }
395 #endif
396   PetscFunctionReturn(0);
397 }
398 
399 
400 #undef __FUNCT__
401 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable"
402 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat *C)
403 {
404   PetscErrorCode     ierr;
405   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
406   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
407   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
408   MatScalar          *ca;
409   PetscReal          afill;
410   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
411   PetscTable         ta;
412   PetscFreeSpaceList free_space=NULL,current_space=NULL;
413 
414   PetscFunctionBegin;
415   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
416   /*---------------------------------------------------------------------------------------------*/
417   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
418   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
419   ci[0] = 0;
420 
421   /* create and initialize a linked list */
422   ierr = PetscTableCreate(bn,bn,&ta);CHKERRQ(ierr);
423   MatRowMergeMax_SeqAIJ(b,bm,ta);
424   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
425   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
426   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
427 
428   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
429   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
430   current_space = free_space;
431 
432   /* Determine ci and cj */
433   for (i=0; i<am; i++) {
434     anzi = ai[i+1] - ai[i];
435     aj   = a->j + ai[i];
436     for (j=0; j<anzi; j++) {
437       brow = aj[j];
438       bnzj = bi[brow+1] - bi[brow];
439       bj   = b->j + bi[brow];
440       /* add non-zero cols of B into the sorted linked list lnk */
441       ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr);
442     }
443     cnzi = lnk[0];
444 
445     /* If free space is not available, make more free space */
446     /* Double the amount of total space in the list */
447     if (current_space->local_remaining<cnzi) {
448       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);CHKERRQ(ierr);
449       ndouble++;
450     }
451 
452     /* Copy data into free space, then initialize lnk */
453     ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr);
454 
455     current_space->array           += cnzi;
456     current_space->local_used      += cnzi;
457     current_space->local_remaining -= cnzi;
458 
459     ci[i+1] = ci[i] + cnzi;
460   }
461 
462   /* Column indices are in the list of free space */
463   /* Allocate space for cj, initialize cj, and */
464   /* destroy list of free space and other temporary array(s) */
465   ierr = PetscMalloc1(ci[am]+1,&cj);CHKERRQ(ierr);
466   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
467   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
468 
469   /* Allocate space for ca */
470   /*-----------------------*/
471   ierr = PetscMalloc1(ci[am]+1,&ca);CHKERRQ(ierr);
472   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
473 
474   /* put together the new symbolic matrix */
475   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);CHKERRQ(ierr);
476   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
477 
478   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
479   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
480   c          = (Mat_SeqAIJ*)((*C)->data);
481   c->free_a  = PETSC_TRUE;
482   c->free_ij = PETSC_TRUE;
483   c->nonew   = 0;
484 
485   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
486 
487   /* set MatInfo */
488   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
489   if (afill < 1.0) afill = 1.0;
490   c->maxnz                     = ci[am];
491   c->nz                        = ci[am];
492   (*C)->info.mallocs           = ndouble;
493   (*C)->info.fill_ratio_given  = fill;
494   (*C)->info.fill_ratio_needed = afill;
495 
496 #if defined(PETSC_USE_INFO)
497   if (ci[am]) {
498     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
499     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
500   } else {
501     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
502   }
503 #endif
504   PetscFunctionReturn(0);
505 }
506 
507 #undef __FUNCT__
508 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap"
509 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat *C)
510 {
511   PetscErrorCode     ierr;
512   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
513   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
514   PetscInt           *ci,*cj,*bb;
515   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
516   PetscReal          afill;
517   PetscInt           i,j,col,ndouble = 0;
518   PetscFreeSpaceList free_space=NULL,current_space=NULL;
519   PetscHeap          h;
520 
521   PetscFunctionBegin;
522   /* Get ci and cj - by merging sorted rows using a heap */
523   /*---------------------------------------------------------------------------------------------*/
524   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
525   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
526   ci[0] = 0;
527 
528   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
529   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
530   current_space = free_space;
531 
532   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
533   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
534 
535   /* Determine ci and cj */
536   for (i=0; i<am; i++) {
537     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
538     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
539     ci[i+1] = ci[i];
540     /* Populate the min heap */
541     for (j=0; j<anzi; j++) {
542       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
543       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
544         ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr);
545       }
546     }
547     /* Pick off the min element, adding it to free space */
548     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
549     while (j >= 0) {
550       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
551         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
552         ndouble++;
553       }
554       *(current_space->array++) = col;
555       current_space->local_used++;
556       current_space->local_remaining--;
557       ci[i+1]++;
558 
559       /* stash if anything else remains in this row of B */
560       if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);}
561       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
562         PetscInt j2,col2;
563         ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr);
564         if (col2 != col) break;
565         ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr);
566         if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);}
567       }
568       /* Put any stashed elements back into the min heap */
569       ierr = PetscHeapUnstash(h);CHKERRQ(ierr);
570       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
571     }
572   }
573   ierr = PetscFree(bb);CHKERRQ(ierr);
574   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
575 
576   /* Column indices are in the list of free space */
577   /* Allocate space for cj, initialize cj, and */
578   /* destroy list of free space and other temporary array(s) */
579   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
580   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
581 
582   /* put together the new symbolic matrix */
583   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
584   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
585 
586   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
587   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
588   c          = (Mat_SeqAIJ*)((*C)->data);
589   c->free_a  = PETSC_TRUE;
590   c->free_ij = PETSC_TRUE;
591   c->nonew   = 0;
592 
593   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
594 
595   /* set MatInfo */
596   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
597   if (afill < 1.0) afill = 1.0;
598   c->maxnz                     = ci[am];
599   c->nz                        = ci[am];
600   (*C)->info.mallocs           = ndouble;
601   (*C)->info.fill_ratio_given  = fill;
602   (*C)->info.fill_ratio_needed = afill;
603 
604 #if defined(PETSC_USE_INFO)
605   if (ci[am]) {
606     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
607     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
608   } else {
609     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
610   }
611 #endif
612   PetscFunctionReturn(0);
613 }
614 
615 #undef __FUNCT__
616 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap"
617 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat *C)
618 {
619   PetscErrorCode     ierr;
620   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
621   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
622   PetscInt           *ci,*cj,*bb;
623   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
624   PetscReal          afill;
625   PetscInt           i,j,col,ndouble = 0;
626   PetscFreeSpaceList free_space=NULL,current_space=NULL;
627   PetscHeap          h;
628   PetscBT            bt;
629 
630   PetscFunctionBegin;
631   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
632   /*---------------------------------------------------------------------------------------------*/
633   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
634   ierr  = PetscMalloc1(am+2,&ci);CHKERRQ(ierr);
635   ci[0] = 0;
636 
637   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
638   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);CHKERRQ(ierr);
639 
640   current_space = free_space;
641 
642   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
643   ierr = PetscMalloc1(a->rmax,&bb);CHKERRQ(ierr);
644   ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr);
645 
646   /* Determine ci and cj */
647   for (i=0; i<am; i++) {
648     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
649     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
650     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
651     ci[i+1] = ci[i];
652     /* Populate the min heap */
653     for (j=0; j<anzi; j++) {
654       PetscInt brow = acol[j];
655       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
656         PetscInt bcol = bj[bb[j]];
657         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
658           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
659           bb[j]++;
660           break;
661         }
662       }
663     }
664     /* Pick off the min element, adding it to free space */
665     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
666     while (j >= 0) {
667       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
668         fptr = NULL;                      /* need PetscBTMemzero */
669         ierr = PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);CHKERRQ(ierr);
670         ndouble++;
671       }
672       *(current_space->array++) = col;
673       current_space->local_used++;
674       current_space->local_remaining--;
675       ci[i+1]++;
676 
677       /* stash if anything else remains in this row of B */
678       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
679         PetscInt bcol = bj[bb[j]];
680         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
681           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
682           bb[j]++;
683           break;
684         }
685       }
686       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
687     }
688     if (fptr) {                 /* Clear the bits for this row */
689       for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);}
690     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
691       ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr);
692     }
693   }
694   ierr = PetscFree(bb);CHKERRQ(ierr);
695   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
696   ierr = PetscBTDestroy(&bt);CHKERRQ(ierr);
697 
698   /* Column indices are in the list of free space */
699   /* Allocate space for cj, initialize cj, and */
700   /* destroy list of free space and other temporary array(s) */
701   ierr = PetscMalloc1(ci[am],&cj);CHKERRQ(ierr);
702   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
703 
704   /* put together the new symbolic matrix */
705   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
706   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
707 
708   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
709   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
710   c          = (Mat_SeqAIJ*)((*C)->data);
711   c->free_a  = PETSC_TRUE;
712   c->free_ij = PETSC_TRUE;
713   c->nonew   = 0;
714 
715   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
716 
717   /* set MatInfo */
718   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
719   if (afill < 1.0) afill = 1.0;
720   c->maxnz                     = ci[am];
721   c->nz                        = ci[am];
722   (*C)->info.mallocs           = ndouble;
723   (*C)->info.fill_ratio_given  = fill;
724   (*C)->info.fill_ratio_needed = afill;
725 
726 #if defined(PETSC_USE_INFO)
727   if (ci[am]) {
728     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
729     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
730   } else {
731     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
732   }
733 #endif
734   PetscFunctionReturn(0);
735 }
736 
737 #undef __FUNCT__
738 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ"
739 /* concatenate unique entries and then sort */
740 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
741 {
742   PetscErrorCode     ierr;
743   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
744   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
745   PetscInt           *ci,*cj;
746   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
747   PetscReal          afill;
748   PetscInt           i,j,ndouble = 0;
749   PetscSegBuffer     seg,segrow;
750   char               *seen;
751 
752   PetscFunctionBegin;
753   ierr  = PetscMalloc1(am+1,&ci);CHKERRQ(ierr);
754   ci[0] = 0;
755 
756   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
757   ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr);
758   ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr);
759   ierr = PetscMalloc1(bn,&seen);CHKERRQ(ierr);
760   ierr = PetscMemzero(seen,bn*sizeof(char));CHKERRQ(ierr);
761 
762   /* Determine ci and cj */
763   for (i=0; i<am; i++) {
764     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
765     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
766     PetscInt packlen = 0,*PETSC_RESTRICT crow;
767     /* Pack segrow */
768     for (j=0; j<anzi; j++) {
769       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
770       for (k=bjstart; k<bjend; k++) {
771         PetscInt bcol = bj[k];
772         if (!seen[bcol]) { /* new entry */
773           PetscInt *PETSC_RESTRICT slot;
774           ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr);
775           *slot = bcol;
776           seen[bcol] = 1;
777           packlen++;
778         }
779       }
780     }
781     ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr);
782     ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr);
783     ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr);
784     ci[i+1] = ci[i] + packlen;
785     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
786   }
787   ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr);
788   ierr = PetscFree(seen);CHKERRQ(ierr);
789 
790   /* Column indices are in the segmented buffer */
791   ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr);
792   ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr);
793 
794   /* put together the new symbolic matrix */
795   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
796   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
797 
798   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
799   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
800   c          = (Mat_SeqAIJ*)((*C)->data);
801   c->free_a  = PETSC_TRUE;
802   c->free_ij = PETSC_TRUE;
803   c->nonew   = 0;
804 
805   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
806 
807   /* set MatInfo */
808   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
809   if (afill < 1.0) afill = 1.0;
810   c->maxnz                     = ci[am];
811   c->nz                        = ci[am];
812   (*C)->info.mallocs           = ndouble;
813   (*C)->info.fill_ratio_given  = fill;
814   (*C)->info.fill_ratio_needed = afill;
815 
816 #if defined(PETSC_USE_INFO)
817   if (ci[am]) {
818     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);CHKERRQ(ierr);
819     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
820   } else {
821     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
822   }
823 #endif
824   PetscFunctionReturn(0);
825 }
826 
827 /* This routine is not used. Should be removed! */
828 #undef __FUNCT__
829 #define __FUNCT__ "MatMatTransposeMult_SeqAIJ_SeqAIJ"
830 PetscErrorCode MatMatTransposeMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
831 {
832   PetscErrorCode ierr;
833 
834   PetscFunctionBegin;
835   if (scall == MAT_INITIAL_MATRIX) {
836     ierr = PetscLogEventBegin(MAT_MatTransposeMultSymbolic,A,B,0,0);CHKERRQ(ierr);
837     ierr = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
838     ierr = PetscLogEventEnd(MAT_MatTransposeMultSymbolic,A,B,0,0);CHKERRQ(ierr);
839   }
840   ierr = PetscLogEventBegin(MAT_MatTransposeMultNumeric,A,B,0,0);CHKERRQ(ierr);
841   ierr = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
842   ierr = PetscLogEventEnd(MAT_MatTransposeMultNumeric,A,B,0,0);CHKERRQ(ierr);
843   PetscFunctionReturn(0);
844 }
845 
846 #undef __FUNCT__
847 #define __FUNCT__ "MatDestroy_SeqAIJ_MatMatMultTrans"
848 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A)
849 {
850   PetscErrorCode      ierr;
851   Mat_SeqAIJ          *a=(Mat_SeqAIJ*)A->data;
852   Mat_MatMatTransMult *abt=a->abt;
853 
854   PetscFunctionBegin;
855   ierr = (abt->destroy)(A);CHKERRQ(ierr);
856   ierr = MatTransposeColoringDestroy(&abt->matcoloring);CHKERRQ(ierr);
857   ierr = MatDestroy(&abt->Bt_den);CHKERRQ(ierr);
858   ierr = MatDestroy(&abt->ABt_den);CHKERRQ(ierr);
859   ierr = PetscFree(abt);CHKERRQ(ierr);
860   PetscFunctionReturn(0);
861 }
862 
863 #undef __FUNCT__
864 #define __FUNCT__ "MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ"
865 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
866 {
867   PetscErrorCode      ierr;
868   Mat                 Bt;
869   PetscInt            *bti,*btj;
870   Mat_MatMatTransMult *abt;
871   Mat_SeqAIJ          *c;
872 
873   PetscFunctionBegin;
874   /* create symbolic Bt */
875   ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
876   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr);
877   ierr = MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
878 
879   /* get symbolic C=A*Bt */
880   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr);
881 
882   /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
883   ierr   = PetscNew(&abt);CHKERRQ(ierr);
884   c      = (Mat_SeqAIJ*)(*C)->data;
885   c->abt = abt;
886 
887   abt->usecoloring = PETSC_FALSE;
888   abt->destroy     = (*C)->ops->destroy;
889   (*C)->ops->destroy     = MatDestroy_SeqAIJ_MatMatMultTrans;
890 
891   ierr = PetscOptionsGetBool(((PetscObject)A)->options,NULL,"-matmattransmult_color",&abt->usecoloring,NULL);CHKERRQ(ierr);
892   if (abt->usecoloring) {
893     /* Create MatTransposeColoring from symbolic C=A*B^T */
894     MatTransposeColoring matcoloring;
895     MatColoring          coloring;
896     ISColoring           iscoloring;
897     Mat                  Bt_dense,C_dense;
898     Mat_SeqAIJ           *c=(Mat_SeqAIJ*)(*C)->data;
899     /* inode causes memory problem, don't know why */
900     if (c->inode.use) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_USE_INODES is not supported. Use '-mat_no_inode'");
901 
902     ierr = MatColoringCreate(*C,&coloring);CHKERRQ(ierr);
903     ierr = MatColoringSetDistance(coloring,2);CHKERRQ(ierr);
904     ierr = MatColoringSetType(coloring,MATCOLORINGSL);CHKERRQ(ierr);
905     ierr = MatColoringSetFromOptions(coloring);CHKERRQ(ierr);
906     ierr = MatColoringApply(coloring,&iscoloring);CHKERRQ(ierr);
907     ierr = MatColoringDestroy(&coloring);CHKERRQ(ierr);
908     ierr = MatTransposeColoringCreate(*C,iscoloring,&matcoloring);CHKERRQ(ierr);
909 
910     abt->matcoloring = matcoloring;
911 
912     ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr);
913 
914     /* Create Bt_dense and C_dense = A*Bt_dense */
915     ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr);
916     ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr);
917     ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr);
918     ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr);
919 
920     Bt_dense->assembled = PETSC_TRUE;
921     abt->Bt_den   = Bt_dense;
922 
923     ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr);
924     ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr);
925     ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr);
926     ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr);
927 
928     Bt_dense->assembled = PETSC_TRUE;
929     abt->ABt_den  = C_dense;
930 
931 #if defined(PETSC_USE_INFO)
932     {
933       Mat_SeqAIJ *c = (Mat_SeqAIJ*)(*C)->data;
934       ierr = PetscInfo7(*C,"Use coloring of C=A*B^T; B^T: %D %D, Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));CHKERRQ(ierr);
935     }
936 #endif
937   }
938   /* clean up */
939   ierr = MatDestroy(&Bt);CHKERRQ(ierr);
940   ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
941   PetscFunctionReturn(0);
942 }
943 
944 #undef __FUNCT__
945 #define __FUNCT__ "MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ"
946 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
947 {
948   PetscErrorCode      ierr;
949   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
950   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
951   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
952   PetscLogDouble      flops=0.0;
953   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
954   Mat_MatMatTransMult *abt = c->abt;
955 
956   PetscFunctionBegin;
957   /* clear old values in C */
958   if (!c->a) {
959     ierr      = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
960     c->a      = ca;
961     c->free_a = PETSC_TRUE;
962   } else {
963     ca =  c->a;
964   }
965   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
966 
967   if (abt->usecoloring) {
968     MatTransposeColoring matcoloring = abt->matcoloring;
969     Mat                  Bt_dense,C_dense = abt->ABt_den;
970 
971     /* Get Bt_dense by Apply MatTransposeColoring to B */
972     Bt_dense = abt->Bt_den;
973     ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr);
974 
975     /* C_dense = A*Bt_dense */
976     ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr);
977 
978     /* Recover C from C_dense */
979     ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr);
980     PetscFunctionReturn(0);
981   }
982 
983   for (i=0; i<cm; i++) {
984     anzi = ai[i+1] - ai[i];
985     acol = aj + ai[i];
986     aval = aa + ai[i];
987     cnzi = ci[i+1] - ci[i];
988     ccol = cj + ci[i];
989     cval = ca + ci[i];
990     for (j=0; j<cnzi; j++) {
991       brow = ccol[j];
992       bnzj = bi[brow+1] - bi[brow];
993       bcol = bj + bi[brow];
994       bval = ba + bi[brow];
995 
996       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
997       nexta = 0; nextb = 0;
998       while (nexta<anzi && nextb<bnzj) {
999         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
1000         if (nexta == anzi) break;
1001         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
1002         if (nextb == bnzj) break;
1003         if (acol[nexta] == bcol[nextb]) {
1004           cval[j] += aval[nexta]*bval[nextb];
1005           nexta++; nextb++;
1006           flops += 2;
1007         }
1008       }
1009     }
1010   }
1011   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1012   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1013   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1014   PetscFunctionReturn(0);
1015 }
1016 
1017 #undef __FUNCT__
1018 #define __FUNCT__ "MatTransposeMatMult_SeqAIJ_SeqAIJ"
1019 PetscErrorCode MatTransposeMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
1020 {
1021   PetscErrorCode ierr;
1022 
1023   PetscFunctionBegin;
1024   if (scall == MAT_INITIAL_MATRIX) {
1025     ierr = PetscLogEventBegin(MAT_TransposeMatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
1026     ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
1027     ierr = PetscLogEventEnd(MAT_TransposeMatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
1028   }
1029   ierr = PetscLogEventBegin(MAT_TransposeMatMultNumeric,A,B,0,0);CHKERRQ(ierr);
1030   ierr = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
1031   ierr = PetscLogEventEnd(MAT_TransposeMatMultNumeric,A,B,0,0);CHKERRQ(ierr);
1032   PetscFunctionReturn(0);
1033 }
1034 
1035 #undef __FUNCT__
1036 #define __FUNCT__ "MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ"
1037 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
1038 {
1039   PetscErrorCode ierr;
1040   Mat            At;
1041   PetscInt       *ati,*atj;
1042 
1043   PetscFunctionBegin;
1044   /* create symbolic At */
1045   ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
1046   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr);
1047   ierr = MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));CHKERRQ(ierr);
1048 
1049   /* get symbolic C=At*B */
1050   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
1051 
1052   /* clean up */
1053   ierr = MatDestroy(&At);CHKERRQ(ierr);
1054   ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
1055   PetscFunctionReturn(0);
1056 }
1057 
1058 #undef __FUNCT__
1059 #define __FUNCT__ "MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ"
1060 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1061 {
1062   PetscErrorCode ierr;
1063   Mat_SeqAIJ     *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1064   PetscInt       am   =A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1065   PetscInt       cm   =C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1066   PetscLogDouble flops=0.0;
1067   MatScalar      *aa  =a->a,*ba,*ca,*caj;
1068 
1069   PetscFunctionBegin;
1070   if (!c->a) {
1071     ierr = PetscMalloc1(ci[cm]+1,&ca);CHKERRQ(ierr);
1072 
1073     c->a      = ca;
1074     c->free_a = PETSC_TRUE;
1075   } else {
1076     ca = c->a;
1077   }
1078   /* clear old values in C */
1079   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1080 
1081   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1082   for (i=0; i<am; i++) {
1083     bj   = b->j + bi[i];
1084     ba   = b->a + bi[i];
1085     bnzi = bi[i+1] - bi[i];
1086     anzi = ai[i+1] - ai[i];
1087     for (j=0; j<anzi; j++) {
1088       nextb = 0;
1089       crow  = *aj++;
1090       cjj   = cj + ci[crow];
1091       caj   = ca + ci[crow];
1092       /* perform sparse axpy operation.  Note cjj includes bj. */
1093       for (k=0; nextb<bnzi; k++) {
1094         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
1095           caj[k] += (*aa)*(*(ba+nextb));
1096           nextb++;
1097         }
1098       }
1099       flops += 2*bnzi;
1100       aa++;
1101     }
1102   }
1103 
1104   /* Assemble the final matrix and clean up */
1105   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1106   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1107   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1108   PetscFunctionReturn(0);
1109 }
1110 
1111 #undef __FUNCT__
1112 #define __FUNCT__ "MatMatMult_SeqAIJ_SeqDense"
1113 PETSC_INTERN PetscErrorCode MatMatMult_SeqAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
1114 {
1115   PetscErrorCode ierr;
1116 
1117   PetscFunctionBegin;
1118   if (scall == MAT_INITIAL_MATRIX) {
1119     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
1120     ierr = MatMatMultSymbolic_SeqAIJ_SeqDense(A,B,fill,C);CHKERRQ(ierr);
1121     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
1122   }
1123   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
1124   ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,B,*C);CHKERRQ(ierr);
1125   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
1126   PetscFunctionReturn(0);
1127 }
1128 
1129 #undef __FUNCT__
1130 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqDense"
1131 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
1132 {
1133   PetscErrorCode ierr;
1134 
1135   PetscFunctionBegin;
1136   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
1137 
1138   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1139   PetscFunctionReturn(0);
1140 }
1141 
1142 #undef __FUNCT__
1143 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqDense"
1144 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1145 {
1146   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
1147   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
1148   PetscErrorCode    ierr;
1149   PetscScalar       *c,*b,r1,r2,r3,r4,*c1,*c2,*c3,*c4,aatmp;
1150   const PetscScalar *aa,*b1,*b2,*b3,*b4;
1151   const PetscInt    *aj;
1152   PetscInt          cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n;
1153   PetscInt          am4=4*am,bm4=4*bm,col,i,j,n,ajtmp;
1154 
1155   PetscFunctionBegin;
1156   if (!cm || !cn) PetscFunctionReturn(0);
1157   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
1158   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1159   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1160   b = bd->v;
1161   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1162   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1163   c1 = c; c2 = c1 + am; c3 = c2 + am; c4 = c3 + am;
1164   for (col=0; col<cn-4; col += 4) {  /* over columns of C */
1165     for (i=0; i<am; i++) {        /* over rows of C in those columns */
1166       r1 = r2 = r3 = r4 = 0.0;
1167       n  = a->i[i+1] - a->i[i];
1168       aj = a->j + a->i[i];
1169       aa = a->a + a->i[i];
1170       for (j=0; j<n; j++) {
1171         aatmp = aa[j]; ajtmp = aj[j];
1172         r1 += aatmp*b1[ajtmp];
1173         r2 += aatmp*b2[ajtmp];
1174         r3 += aatmp*b3[ajtmp];
1175         r4 += aatmp*b4[ajtmp];
1176       }
1177       c1[i] = r1;
1178       c2[i] = r2;
1179       c3[i] = r3;
1180       c4[i] = r4;
1181     }
1182     b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1183     c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1184   }
1185   for (; col<cn; col++) {   /* over extra columns of C */
1186     for (i=0; i<am; i++) {  /* over rows of C in those columns */
1187       r1 = 0.0;
1188       n  = a->i[i+1] - a->i[i];
1189       aj = a->j + a->i[i];
1190       aa = a->a + a->i[i];
1191       for (j=0; j<n; j++) {
1192         r1 += aa[j]*b1[aj[j]];
1193       }
1194       c1[i] = r1;
1195     }
1196     b1 += bm;
1197     c1 += am;
1198   }
1199   ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr);
1200   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1201   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1202   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1203   PetscFunctionReturn(0);
1204 }
1205 
1206 /*
1207    Note very similar to MatMult_SeqAIJ(), should generate both codes from same base
1208 */
1209 #undef __FUNCT__
1210 #define __FUNCT__ "MatMatMultNumericAdd_SeqAIJ_SeqDense"
1211 PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1212 {
1213   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
1214   Mat_SeqDense   *bd = (Mat_SeqDense*)B->data;
1215   PetscErrorCode ierr;
1216   PetscScalar    *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
1217   MatScalar      *aa;
1218   PetscInt       cm  = C->rmap->n, cn=B->cmap->n, bm=bd->lda, col, i,j,n,*aj, am = A->rmap->n,*ii,arm;
1219   PetscInt       am2 = 2*am, am3 = 3*am,  bm4 = 4*bm,colam,*ridx;
1220 
1221   PetscFunctionBegin;
1222   if (!cm || !cn) PetscFunctionReturn(0);
1223   b = bd->v;
1224   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1225   b1   = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1226 
1227   if (a->compressedrow.use) { /* use compressed row format */
1228     for (col=0; col<cn-4; col += 4) {  /* over columns of C */
1229       colam = col*am;
1230       arm   = a->compressedrow.nrows;
1231       ii    = a->compressedrow.i;
1232       ridx  = a->compressedrow.rindex;
1233       for (i=0; i<arm; i++) {        /* over rows of C in those columns */
1234         r1 = r2 = r3 = r4 = 0.0;
1235         n  = ii[i+1] - ii[i];
1236         aj = a->j + ii[i];
1237         aa = a->a + ii[i];
1238         for (j=0; j<n; j++) {
1239           r1 += (*aa)*b1[*aj];
1240           r2 += (*aa)*b2[*aj];
1241           r3 += (*aa)*b3[*aj];
1242           r4 += (*aa++)*b4[*aj++];
1243         }
1244         c[colam       + ridx[i]] += r1;
1245         c[colam + am  + ridx[i]] += r2;
1246         c[colam + am2 + ridx[i]] += r3;
1247         c[colam + am3 + ridx[i]] += r4;
1248       }
1249       b1 += bm4;
1250       b2 += bm4;
1251       b3 += bm4;
1252       b4 += bm4;
1253     }
1254     for (; col<cn; col++) {     /* over extra columns of C */
1255       colam = col*am;
1256       arm   = a->compressedrow.nrows;
1257       ii    = a->compressedrow.i;
1258       ridx  = a->compressedrow.rindex;
1259       for (i=0; i<arm; i++) {  /* over rows of C in those columns */
1260         r1 = 0.0;
1261         n  = ii[i+1] - ii[i];
1262         aj = a->j + ii[i];
1263         aa = a->a + ii[i];
1264 
1265         for (j=0; j<n; j++) {
1266           r1 += (*aa++)*b1[*aj++];
1267         }
1268         c[colam + ridx[i]] += r1;
1269       }
1270       b1 += bm;
1271     }
1272   } else {
1273     for (col=0; col<cn-4; col += 4) {  /* over columns of C */
1274       colam = col*am;
1275       for (i=0; i<am; i++) {        /* over rows of C in those columns */
1276         r1 = r2 = r3 = r4 = 0.0;
1277         n  = a->i[i+1] - a->i[i];
1278         aj = a->j + a->i[i];
1279         aa = a->a + a->i[i];
1280         for (j=0; j<n; j++) {
1281           r1 += (*aa)*b1[*aj];
1282           r2 += (*aa)*b2[*aj];
1283           r3 += (*aa)*b3[*aj];
1284           r4 += (*aa++)*b4[*aj++];
1285         }
1286         c[colam + i]       += r1;
1287         c[colam + am + i]  += r2;
1288         c[colam + am2 + i] += r3;
1289         c[colam + am3 + i] += r4;
1290       }
1291       b1 += bm4;
1292       b2 += bm4;
1293       b3 += bm4;
1294       b4 += bm4;
1295     }
1296     for (; col<cn; col++) {     /* over extra columns of C */
1297       colam = col*am;
1298       for (i=0; i<am; i++) {  /* over rows of C in those columns */
1299         r1 = 0.0;
1300         n  = a->i[i+1] - a->i[i];
1301         aj = a->j + a->i[i];
1302         aa = a->a + a->i[i];
1303 
1304         for (j=0; j<n; j++) {
1305           r1 += (*aa++)*b1[*aj++];
1306         }
1307         c[colam + i] += r1;
1308       }
1309       b1 += bm;
1310     }
1311   }
1312   ierr = PetscLogFlops(cn*2.0*a->nz);CHKERRQ(ierr);
1313   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1314   PetscFunctionReturn(0);
1315 }
1316 
1317 #undef __FUNCT__
1318 #define __FUNCT__ "MatTransColoringApplySpToDen_SeqAIJ"
1319 PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1320 {
1321   PetscErrorCode ierr;
1322   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
1323   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
1324   PetscInt       *bi      = b->i,*bj=b->j;
1325   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1326   MatScalar      *btval,*btval_den,*ba=b->a;
1327   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1328 
1329   PetscFunctionBegin;
1330   btval_den=btdense->v;
1331   ierr     = PetscMemzero(btval_den,(m*n)*sizeof(MatScalar));CHKERRQ(ierr);
1332   for (k=0; k<ncolors; k++) {
1333     ncolumns = coloring->ncolumns[k];
1334     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1335       col   = *(columns + colorforcol[k] + l);
1336       btcol = bj + bi[col];
1337       btval = ba + bi[col];
1338       anz   = bi[col+1] - bi[col];
1339       for (j=0; j<anz; j++) {
1340         brow            = btcol[j];
1341         btval_den[brow] = btval[j];
1342       }
1343     }
1344     btval_den += m;
1345   }
1346   PetscFunctionReturn(0);
1347 }
1348 
1349 #undef __FUNCT__
1350 #define __FUNCT__ "MatTransColoringApplyDenToSp_SeqAIJ"
1351 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1352 {
1353   PetscErrorCode ierr;
1354   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)Csp->data;
1355   PetscScalar    *ca_den,*ca_den_ptr,*ca=csp->a;
1356   PetscInt       k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1357   PetscInt       brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1358   PetscInt       nrows,*row,*idx;
1359   PetscInt       *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
1360 
1361   PetscFunctionBegin;
1362   ierr   = MatDenseGetArray(Cden,&ca_den);CHKERRQ(ierr);
1363 
1364   if (brows > 0) {
1365     PetscInt *lstart,row_end,row_start;
1366     lstart = matcoloring->lstart;
1367     ierr = PetscMemzero(lstart,ncolors*sizeof(PetscInt));CHKERRQ(ierr);
1368 
1369     row_end = brows;
1370     if (row_end > m) row_end = m;
1371     for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1372       ca_den_ptr = ca_den;
1373       for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1374         nrows = matcoloring->nrows[k];
1375         row   = rows  + colorforrow[k];
1376         idx   = den2sp + colorforrow[k];
1377         for (l=lstart[k]; l<nrows; l++) {
1378           if (row[l] >= row_end) {
1379             lstart[k] = l;
1380             break;
1381           } else {
1382             ca[idx[l]] = ca_den_ptr[row[l]];
1383           }
1384         }
1385         ca_den_ptr += m;
1386       }
1387       row_end += brows;
1388       if (row_end > m) row_end = m;
1389     }
1390   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1391     ca_den_ptr = ca_den;
1392     for (k=0; k<ncolors; k++) {
1393       nrows = matcoloring->nrows[k];
1394       row   = rows  + colorforrow[k];
1395       idx   = den2sp + colorforrow[k];
1396       for (l=0; l<nrows; l++) {
1397         ca[idx[l]] = ca_den_ptr[row[l]];
1398       }
1399       ca_den_ptr += m;
1400     }
1401   }
1402 
1403   ierr = MatDenseRestoreArray(Cden,&ca_den);CHKERRQ(ierr);
1404 #if defined(PETSC_USE_INFO)
1405   if (matcoloring->brows > 0) {
1406     ierr = PetscInfo1(Csp,"Loop over %D row blocks for den2sp\n",brows);CHKERRQ(ierr);
1407   } else {
1408     ierr = PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");CHKERRQ(ierr);
1409   }
1410 #endif
1411   PetscFunctionReturn(0);
1412 }
1413 
1414 #undef __FUNCT__
1415 #define __FUNCT__ "MatTransposeColoringCreate_SeqAIJ"
1416 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1417 {
1418   PetscErrorCode ierr;
1419   PetscInt       i,n,nrows,Nbs,j,k,m,ncols,col,cm;
1420   const PetscInt *is,*ci,*cj,*row_idx;
1421   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1422   IS             *isa;
1423   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1424   PetscInt       *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1425   PetscInt       *colorforcol,*columns,*columns_i,brows;
1426   PetscBool      flg;
1427 
1428   PetscFunctionBegin;
1429   ierr = ISColoringGetIS(iscoloring,PETSC_IGNORE,&isa);CHKERRQ(ierr);
1430 
1431   /* bs >1 is not being tested yet! */
1432   Nbs       = mat->cmap->N/bs;
1433   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1434   c->N      = Nbs;
1435   c->m      = c->M;
1436   c->rstart = 0;
1437   c->brows  = 100;
1438 
1439   c->ncolors = nis;
1440   ierr = PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);CHKERRQ(ierr);
1441   ierr = PetscMalloc1(csp->nz+1,&rows);CHKERRQ(ierr);
1442   ierr = PetscMalloc1(csp->nz+1,&den2sp);CHKERRQ(ierr);
1443 
1444   brows = c->brows;
1445   ierr = PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);CHKERRQ(ierr);
1446   if (flg) c->brows = brows;
1447   if (brows > 0) {
1448     ierr = PetscMalloc1(nis+1,&c->lstart);CHKERRQ(ierr);
1449   }
1450 
1451   colorforrow[0] = 0;
1452   rows_i         = rows;
1453   den2sp_i       = den2sp;
1454 
1455   ierr = PetscMalloc1(nis+1,&colorforcol);CHKERRQ(ierr);
1456   ierr = PetscMalloc1(Nbs+1,&columns);CHKERRQ(ierr);
1457 
1458   colorforcol[0] = 0;
1459   columns_i      = columns;
1460 
1461   /* get column-wise storage of mat */
1462   ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1463 
1464   cm   = c->m;
1465   ierr = PetscMalloc1(cm+1,&rowhit);CHKERRQ(ierr);
1466   ierr = PetscMalloc1(cm+1,&idxhit);CHKERRQ(ierr);
1467   for (i=0; i<nis; i++) { /* loop over color */
1468     ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr);
1469     ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr);
1470 
1471     c->ncolumns[i] = n;
1472     if (n) {
1473       ierr = PetscMemcpy(columns_i,is,n*sizeof(PetscInt));CHKERRQ(ierr);
1474     }
1475     colorforcol[i+1] = colorforcol[i] + n;
1476     columns_i       += n;
1477 
1478     /* fast, crude version requires O(N*N) work */
1479     ierr = PetscMemzero(rowhit,cm*sizeof(PetscInt));CHKERRQ(ierr);
1480 
1481     for (j=0; j<n; j++) { /* loop over columns*/
1482       col     = is[j];
1483       row_idx = cj + ci[col];
1484       m       = ci[col+1] - ci[col];
1485       for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1486         idxhit[*row_idx]   = spidx[ci[col] + k];
1487         rowhit[*row_idx++] = col + 1;
1488       }
1489     }
1490     /* count the number of hits */
1491     nrows = 0;
1492     for (j=0; j<cm; j++) {
1493       if (rowhit[j]) nrows++;
1494     }
1495     c->nrows[i]      = nrows;
1496     colorforrow[i+1] = colorforrow[i] + nrows;
1497 
1498     nrows = 0;
1499     for (j=0; j<cm; j++) { /* loop over rows */
1500       if (rowhit[j]) {
1501         rows_i[nrows]   = j;
1502         den2sp_i[nrows] = idxhit[j];
1503         nrows++;
1504       }
1505     }
1506     den2sp_i += nrows;
1507 
1508     ierr    = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr);
1509     rows_i += nrows;
1510   }
1511   ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1512   ierr = PetscFree(rowhit);CHKERRQ(ierr);
1513   ierr = ISColoringRestoreIS(iscoloring,&isa);CHKERRQ(ierr);
1514   if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1515 
1516   c->colorforrow = colorforrow;
1517   c->rows        = rows;
1518   c->den2sp      = den2sp;
1519   c->colorforcol = colorforcol;
1520   c->columns     = columns;
1521 
1522   ierr = PetscFree(idxhit);CHKERRQ(ierr);
1523   PetscFunctionReturn(0);
1524 }
1525