xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision fb8e56e08d4d0bfe9fc63603ca1f7fddd68abbdb) !
1 
2 /*
3   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4           C = A * B
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <../src/mat/utils/petscheap.h>
10 #include <petscbt.h>
11 #include <../src/mat/impls/dense/seq/dense.h>
12 
13 static PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat,Mat,PetscReal,Mat*);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatMatMult_SeqAIJ_SeqAIJ"
17 PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
18 {
19   PetscErrorCode ierr;
20   PetscBool      scalable=PETSC_FALSE,scalable_fast=PETSC_FALSE,heap = PETSC_FALSE,btheap = PETSC_FALSE,llcondensed = PETSC_FALSE;
21 
22   PetscFunctionBegin;
23   if (scall == MAT_INITIAL_MATRIX) {
24     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
25     ierr = PetscOptionsBool("-matmatmult_scalable","Use a scalable but slower C=A*B","",scalable,&scalable,NULL);CHKERRQ(ierr);
26     ierr = PetscOptionsBool("-matmatmult_scalable_fast","Use a scalable but slower C=A*B","",scalable_fast,&scalable_fast,NULL);CHKERRQ(ierr);
27     ierr = PetscOptionsBool("-matmatmult_heap","Use heap implementation of symbolic factorization C=A*B","",heap,&heap,NULL);CHKERRQ(ierr);
28     ierr = PetscOptionsBool("-matmatmult_btheap","Use btheap implementation of symbolic factorization C=A*B","",btheap,&btheap,NULL);CHKERRQ(ierr);
29     ierr = PetscOptionsBool("-matmatmult_llcondensed","Use LLCondensed to for symbolic C=A*B","",llcondensed,&llcondensed,NULL);CHKERRQ(ierr);
30     ierr = PetscOptionsEnd();CHKERRQ(ierr);
31     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
32     if (scalable_fast) {
33       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr);
34     } else if (scalable) {
35       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr);
36     } else if (heap) {
37       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr);
38     } else if (btheap) {
39       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr);
40     } else if (llcondensed) {
41       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);CHKERRQ(ierr);
42     } else {
43       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
44     }
45     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
46   }
47 
48   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
49   ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
50   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
51   PetscFunctionReturn(0);
52 }
53 
54 #undef __FUNCT__
55 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed"
56 static PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat *C)
57 {
58   PetscErrorCode     ierr;
59   Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
60   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
61   PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
62   PetscReal          afill;
63   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
64   PetscBT            lnkbt;
65   PetscFreeSpaceList free_space=NULL,current_space=NULL;
66 
67   PetscFunctionBegin;
68   /* Get ci and cj */
69   /*---------------*/
70   /* Allocate ci array, arrays for fill computation and */
71   /* free space for accumulating nonzero column info */
72   ierr  = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
73   ci[0] = 0;
74 
75   /* create and initialize a linked list */
76   nlnk_max = a->rmax*b->rmax;
77   if (!nlnk_max || nlnk_max > bn) nlnk_max = bn;
78   ierr = PetscLLCondensedCreate(nlnk_max,bn,&lnk,&lnkbt);CHKERRQ(ierr);
79 
80   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
81   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
82 
83   current_space = free_space;
84 
85   /* Determine ci and cj */
86   for (i=0; i<am; i++) {
87     anzi = ai[i+1] - ai[i];
88     aj   = a->j + ai[i];
89     for (j=0; j<anzi; j++) {
90       brow = aj[j];
91       bnzj = bi[brow+1] - bi[brow];
92       bj   = b->j + bi[brow];
93       /* add non-zero cols of B into the sorted linked list lnk */
94       ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr);
95     }
96     cnzi = lnk[0];
97 
98     /* If free space is not available, make more free space */
99     /* Double the amount of total space in the list */
100     if (current_space->local_remaining<cnzi) {
101       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
102       ndouble++;
103     }
104 
105     /* Copy data into free space, then initialize lnk */
106     ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
107 
108     current_space->array           += cnzi;
109     current_space->local_used      += cnzi;
110     current_space->local_remaining -= cnzi;
111 
112     ci[i+1] = ci[i] + cnzi;
113   }
114 
115   /* Column indices are in the list of free space */
116   /* Allocate space for cj, initialize cj, and */
117   /* destroy list of free space and other temporary array(s) */
118   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
119   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
120   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
121 
122   /* put together the new symbolic matrix */
123   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
124 
125   (*C)->rmap->bs = A->rmap->bs;
126   (*C)->cmap->bs = B->cmap->bs;
127 
128   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
129   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
130   c                         = (Mat_SeqAIJ*)((*C)->data);
131   c->free_a                 = PETSC_FALSE;
132   c->free_ij                = PETSC_TRUE;
133   c->nonew                  = 0;
134   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; /* fast, needs non-scalable O(bn) array 'abdense' */
135 
136   /* set MatInfo */
137   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
138   if (afill < 1.0) afill = 1.0;
139   c->maxnz                     = ci[am];
140   c->nz                        = ci[am];
141   (*C)->info.mallocs           = ndouble;
142   (*C)->info.fill_ratio_given  = fill;
143   (*C)->info.fill_ratio_needed = afill;
144 
145 #if defined(PETSC_USE_INFO)
146   if (ci[am]) {
147     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
148     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
149   } else {
150     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
151   }
152 #endif
153   PetscFunctionReturn(0);
154 }
155 
156 #undef __FUNCT__
157 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqAIJ"
158 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
159 {
160   PetscErrorCode ierr;
161   PetscLogDouble flops=0.0;
162   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
163   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
164   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
165   PetscInt       *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
166   PetscInt       am   =A->rmap->n,cm=C->rmap->n;
167   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
168   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca,valtmp;
169   PetscScalar    *ab_dense;
170 
171   PetscFunctionBegin;
172   /* printf("MatMatMultNumeric_SeqAIJ_SeqAIJ...ca %p\n",c->a); */
173   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
174     ierr      = PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
175     c->a      = ca;
176     c->free_a = PETSC_TRUE;
177 
178     ierr = PetscMalloc(B->cmap->N*sizeof(PetscScalar),&ab_dense);CHKERRQ(ierr);
179     ierr = PetscMemzero(ab_dense,B->cmap->N*sizeof(PetscScalar));CHKERRQ(ierr);
180 
181     c->matmult_abdense = ab_dense;
182   } else {
183     ca       = c->a;
184     ab_dense = c->matmult_abdense;
185   }
186 
187   /* clean old values in C */
188   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
189   /* Traverse A row-wise. */
190   /* Build the ith row in C by summing over nonzero columns in A, */
191   /* the rows of B corresponding to nonzeros of A. */
192   for (i=0; i<am; i++) {
193     anzi = ai[i+1] - ai[i];
194     for (j=0; j<anzi; j++) {
195       brow = aj[j];
196       bnzi = bi[brow+1] - bi[brow];
197       bjj  = bj + bi[brow];
198       baj  = ba + bi[brow];
199       /* perform dense axpy */
200       valtmp = aa[j];
201       for (k=0; k<bnzi; k++) {
202         ab_dense[bjj[k]] += valtmp*baj[k];
203       }
204       flops += 2*bnzi;
205     }
206     aj += anzi; aa += anzi;
207 
208     cnzi = ci[i+1] - ci[i];
209     for (k=0; k<cnzi; k++) {
210       ca[k]          += ab_dense[cj[k]];
211       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
212     }
213     flops += cnzi;
214     cj    += cnzi; ca += cnzi;
215   }
216   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
217   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
218   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
219   PetscFunctionReturn(0);
220 }
221 
222 #undef __FUNCT__
223 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable"
224 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
225 {
226   PetscErrorCode ierr;
227   PetscLogDouble flops=0.0;
228   Mat_SeqAIJ     *a   = (Mat_SeqAIJ*)A->data;
229   Mat_SeqAIJ     *b   = (Mat_SeqAIJ*)B->data;
230   Mat_SeqAIJ     *c   = (Mat_SeqAIJ*)C->data;
231   PetscInt       *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
232   PetscInt       am   = A->rmap->N,cm=C->rmap->N;
233   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
234   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
235   PetscInt       nextb;
236 
237   PetscFunctionBegin;
238   /* clean old values in C */
239   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
240   /* Traverse A row-wise. */
241   /* Build the ith row in C by summing over nonzero columns in A, */
242   /* the rows of B corresponding to nonzeros of A. */
243   for (i=0; i<am; i++) {
244     anzi = ai[i+1] - ai[i];
245     cnzi = ci[i+1] - ci[i];
246     for (j=0; j<anzi; j++) {
247       brow = aj[j];
248       bnzi = bi[brow+1] - bi[brow];
249       bjj  = bj + bi[brow];
250       baj  = ba + bi[brow];
251       /* perform sparse axpy */
252       valtmp = aa[j];
253       nextb  = 0;
254       for (k=0; nextb<bnzi; k++) {
255         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
256           ca[k] += valtmp*baj[nextb++];
257         }
258       }
259       flops += 2*bnzi;
260     }
261     aj += anzi; aa += anzi;
262     cj += cnzi; ca += cnzi;
263   }
264 
265   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
266   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
267   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
268   PetscFunctionReturn(0);
269 }
270 
271 #undef __FUNCT__
272 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast"
273 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat *C)
274 {
275   PetscErrorCode     ierr;
276   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
277   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
278   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
279   MatScalar          *ca;
280   PetscReal          afill;
281   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
282   PetscFreeSpaceList free_space=NULL,current_space=NULL;
283 
284   PetscFunctionBegin;
285   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
286   /*-----------------------------------------------------------------------------------------*/
287   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
288   ierr  = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
289   ci[0] = 0;
290 
291   /* create and initialize a linked list */
292   nlnk_max = a->rmax*b->rmax;
293   if (!nlnk_max || nlnk_max > bn) nlnk_max = bn; /* in case rmax is not defined for A or B */
294   ierr = PetscLLCondensedCreate_fast(nlnk_max,&lnk);CHKERRQ(ierr);
295 
296   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
297   ierr          = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
298   current_space = free_space;
299 
300   /* Determine ci and cj */
301   for (i=0; i<am; i++) {
302     anzi = ai[i+1] - ai[i];
303     aj   = a->j + ai[i];
304     for (j=0; j<anzi; j++) {
305       brow = aj[j];
306       bnzj = bi[brow+1] - bi[brow];
307       bj   = b->j + bi[brow];
308       /* add non-zero cols of B into the sorted linked list lnk */
309       ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr);
310     }
311     cnzi = lnk[1];
312 
313     /* If free space is not available, make more free space */
314     /* Double the amount of total space in the list */
315     if (current_space->local_remaining<cnzi) {
316       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
317       ndouble++;
318     }
319 
320     /* Copy data into free space, then initialize lnk */
321     ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr);
322 
323     current_space->array           += cnzi;
324     current_space->local_used      += cnzi;
325     current_space->local_remaining -= cnzi;
326 
327     ci[i+1] = ci[i] + cnzi;
328   }
329 
330   /* Column indices are in the list of free space */
331   /* Allocate space for cj, initialize cj, and */
332   /* destroy list of free space and other temporary array(s) */
333   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
334   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
335   ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr);
336 
337   /* Allocate space for ca */
338   ierr = PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
339   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
340 
341   /* put together the new symbolic matrix */
342   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);CHKERRQ(ierr);
343 
344   (*C)->rmap->bs = A->rmap->bs;
345   (*C)->cmap->bs = B->cmap->bs;
346 
347   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
348   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
349   c          = (Mat_SeqAIJ*)((*C)->data);
350   c->free_a  = PETSC_TRUE;
351   c->free_ij = PETSC_TRUE;
352   c->nonew   = 0;
353 
354   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
355 
356   /* set MatInfo */
357   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
358   if (afill < 1.0) afill = 1.0;
359   c->maxnz                     = ci[am];
360   c->nz                        = ci[am];
361   (*C)->info.mallocs           = ndouble;
362   (*C)->info.fill_ratio_given  = fill;
363   (*C)->info.fill_ratio_needed = afill;
364 
365 #if defined(PETSC_USE_INFO)
366   if (ci[am]) {
367     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
368     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
369   } else {
370     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
371   }
372 #endif
373   PetscFunctionReturn(0);
374 }
375 
376 
377 #undef __FUNCT__
378 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable"
379 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat *C)
380 {
381   PetscErrorCode     ierr;
382   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
383   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
384   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
385   MatScalar          *ca;
386   PetscReal          afill;
387   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
388   PetscFreeSpaceList free_space=NULL,current_space=NULL;
389 
390   PetscFunctionBegin;
391   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
392   /*---------------------------------------------------------------------------------------------*/
393   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
394   ierr  = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
395   ci[0] = 0;
396 
397   /* create and initialize a linked list */
398   nlnk_max = a->rmax*b->rmax;
399   if (!nlnk_max || nlnk_max > bn) nlnk_max = bn; /* in case rmax is not defined for A or B */
400   ierr = PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);CHKERRQ(ierr);
401 
402   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
403   ierr          = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
404   current_space = free_space;
405 
406   /* Determine ci and cj */
407   for (i=0; i<am; i++) {
408     anzi = ai[i+1] - ai[i];
409     aj   = a->j + ai[i];
410     for (j=0; j<anzi; j++) {
411       brow = aj[j];
412       bnzj = bi[brow+1] - bi[brow];
413       bj   = b->j + bi[brow];
414       /* add non-zero cols of B into the sorted linked list lnk */
415       ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr);
416     }
417     cnzi = lnk[0];
418 
419     /* If free space is not available, make more free space */
420     /* Double the amount of total space in the list */
421     if (current_space->local_remaining<cnzi) {
422       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
423       ndouble++;
424     }
425 
426     /* Copy data into free space, then initialize lnk */
427     ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr);
428 
429     current_space->array           += cnzi;
430     current_space->local_used      += cnzi;
431     current_space->local_remaining -= cnzi;
432 
433     ci[i+1] = ci[i] + cnzi;
434   }
435 
436   /* Column indices are in the list of free space */
437   /* Allocate space for cj, initialize cj, and */
438   /* destroy list of free space and other temporary array(s) */
439   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
440   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
441   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
442 
443   /* Allocate space for ca */
444   /*-----------------------*/
445   ierr = PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
446   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
447 
448   /* put together the new symbolic matrix */
449   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);CHKERRQ(ierr);
450 
451   (*C)->rmap->bs = A->rmap->bs;
452   (*C)->cmap->bs = B->cmap->bs;
453 
454   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
455   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
456   c          = (Mat_SeqAIJ*)((*C)->data);
457   c->free_a  = PETSC_TRUE;
458   c->free_ij = PETSC_TRUE;
459   c->nonew   = 0;
460 
461   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
462 
463   /* set MatInfo */
464   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
465   if (afill < 1.0) afill = 1.0;
466   c->maxnz                     = ci[am];
467   c->nz                        = ci[am];
468   (*C)->info.mallocs           = ndouble;
469   (*C)->info.fill_ratio_given  = fill;
470   (*C)->info.fill_ratio_needed = afill;
471 
472 #if defined(PETSC_USE_INFO)
473   if (ci[am]) {
474     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
475     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
476   } else {
477     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
478   }
479 #endif
480   PetscFunctionReturn(0);
481 }
482 
483 #undef __FUNCT__
484 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap"
485 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat *C)
486 {
487   PetscErrorCode     ierr;
488   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
489   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
490   PetscInt           *ci,*cj,*bb;
491   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
492   PetscReal          afill;
493   PetscInt           i,j,col,ndouble = 0;
494   PetscFreeSpaceList free_space=NULL,current_space=NULL;
495   PetscHeap          h;
496 
497   PetscFunctionBegin;
498   /* Get ci and cj - by merging sorted rows using a heap */
499   /*---------------------------------------------------------------------------------------------*/
500   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
501   ierr  = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
502   ci[0] = 0;
503 
504   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
505   ierr          = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
506   current_space = free_space;
507 
508   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
509   ierr = PetscMalloc(a->rmax*sizeof(PetscInt),&bb);CHKERRQ(ierr);
510 
511   /* Determine ci and cj */
512   for (i=0; i<am; i++) {
513     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
514     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
515     ci[i+1] = ci[i];
516     /* Populate the min heap */
517     for (j=0; j<anzi; j++) {
518       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
519       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
520         ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr);
521       }
522     }
523     /* Pick off the min element, adding it to free space */
524     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
525     while (j >= 0) {
526       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
527         ierr = PetscFreeSpaceGet(PetscMin(2*current_space->total_array_size,16 << 20),&current_space);CHKERRQ(ierr);
528         ndouble++;
529       }
530       *(current_space->array++) = col;
531       current_space->local_used++;
532       current_space->local_remaining--;
533       ci[i+1]++;
534 
535       /* stash if anything else remains in this row of B */
536       if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);}
537       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
538         PetscInt j2,col2;
539         ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr);
540         if (col2 != col) break;
541         ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr);
542         if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);}
543       }
544       /* Put any stashed elements back into the min heap */
545       ierr = PetscHeapUnstash(h);CHKERRQ(ierr);
546       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
547     }
548   }
549   ierr = PetscFree(bb);CHKERRQ(ierr);
550   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
551 
552   /* Column indices are in the list of free space */
553   /* Allocate space for cj, initialize cj, and */
554   /* destroy list of free space and other temporary array(s) */
555   ierr = PetscMalloc(ci[am]*sizeof(PetscInt),&cj);CHKERRQ(ierr);
556   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
557 
558   /* put together the new symbolic matrix */
559   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
560 
561   (*C)->rmap->bs = A->rmap->bs;
562   (*C)->cmap->bs = B->cmap->bs;
563 
564   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
565   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
566   c          = (Mat_SeqAIJ*)((*C)->data);
567   c->free_a  = PETSC_TRUE;
568   c->free_ij = PETSC_TRUE;
569   c->nonew   = 0;
570 
571   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
572 
573   /* set MatInfo */
574   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
575   if (afill < 1.0) afill = 1.0;
576   c->maxnz                     = ci[am];
577   c->nz                        = ci[am];
578   (*C)->info.mallocs           = ndouble;
579   (*C)->info.fill_ratio_given  = fill;
580   (*C)->info.fill_ratio_needed = afill;
581 
582 #if defined(PETSC_USE_INFO)
583   if (ci[am]) {
584     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
585     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
586   } else {
587     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
588   }
589 #endif
590   PetscFunctionReturn(0);
591 }
592 
593 #undef __FUNCT__
594 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap"
595 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat *C)
596 {
597   PetscErrorCode     ierr;
598   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
599   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
600   PetscInt           *ci,*cj,*bb;
601   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
602   PetscReal          afill;
603   PetscInt           i,j,col,ndouble = 0;
604   PetscFreeSpaceList free_space=NULL,current_space=NULL;
605   PetscHeap          h;
606   PetscBT            bt;
607 
608   PetscFunctionBegin;
609   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
610   /*---------------------------------------------------------------------------------------------*/
611   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
612   ierr  = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
613   ci[0] = 0;
614 
615   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
616   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
617 
618   current_space = free_space;
619 
620   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
621   ierr = PetscMalloc(a->rmax*sizeof(PetscInt),&bb);CHKERRQ(ierr);
622   ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr);
623 
624   /* Determine ci and cj */
625   for (i=0; i<am; i++) {
626     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
627     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
628     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
629     ci[i+1] = ci[i];
630     /* Populate the min heap */
631     for (j=0; j<anzi; j++) {
632       PetscInt brow = acol[j];
633       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
634         PetscInt bcol = bj[bb[j]];
635         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
636           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
637           bb[j]++;
638           break;
639         }
640       }
641     }
642     /* Pick off the min element, adding it to free space */
643     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
644     while (j >= 0) {
645       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
646         fptr = NULL;                      /* need PetscBTMemzero */
647         ierr = PetscFreeSpaceGet(PetscMin(2*current_space->total_array_size,16 << 20),&current_space);CHKERRQ(ierr);
648         ndouble++;
649       }
650       *(current_space->array++) = col;
651       current_space->local_used++;
652       current_space->local_remaining--;
653       ci[i+1]++;
654 
655       /* stash if anything else remains in this row of B */
656       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
657         PetscInt bcol = bj[bb[j]];
658         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
659           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
660           bb[j]++;
661           break;
662         }
663       }
664       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
665     }
666     if (fptr) {                 /* Clear the bits for this row */
667       for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);}
668     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
669       ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr);
670     }
671   }
672   ierr = PetscFree(bb);CHKERRQ(ierr);
673   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
674   ierr = PetscBTDestroy(&bt);CHKERRQ(ierr);
675 
676   /* Column indices are in the list of free space */
677   /* Allocate space for cj, initialize cj, and */
678   /* destroy list of free space and other temporary array(s) */
679   ierr = PetscMalloc(ci[am]*sizeof(PetscInt),&cj);CHKERRQ(ierr);
680   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
681 
682   /* put together the new symbolic matrix */
683   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
684 
685   (*C)->rmap->bs = A->rmap->bs;
686   (*C)->cmap->bs = B->cmap->bs;
687 
688   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
689   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
690   c          = (Mat_SeqAIJ*)((*C)->data);
691   c->free_a  = PETSC_TRUE;
692   c->free_ij = PETSC_TRUE;
693   c->nonew   = 0;
694 
695   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
696 
697   /* set MatInfo */
698   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
699   if (afill < 1.0) afill = 1.0;
700   c->maxnz                     = ci[am];
701   c->nz                        = ci[am];
702   (*C)->info.mallocs           = ndouble;
703   (*C)->info.fill_ratio_given  = fill;
704   (*C)->info.fill_ratio_needed = afill;
705 
706 #if defined(PETSC_USE_INFO)
707   if (ci[am]) {
708     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
709     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
710   } else {
711     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
712   }
713 #endif
714   PetscFunctionReturn(0);
715 }
716 
717 #undef __FUNCT__
718 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ"
719 /* concatenate unique entries and then sort */
720 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
721 {
722   PetscErrorCode     ierr;
723   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
724   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
725   PetscInt           *ci,*cj;
726   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
727   PetscReal          afill;
728   PetscInt           i,j,ndouble = 0;
729   PetscSegBuffer     seg,segrow;
730   char               *seen;
731 
732   PetscFunctionBegin;
733   ierr  = PetscMalloc((am+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
734   ci[0] = 0;
735 
736   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
737   ierr = PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);CHKERRQ(ierr);
738   ierr = PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);CHKERRQ(ierr);
739   ierr = PetscMalloc(bn*sizeof(char),&seen);CHKERRQ(ierr);
740   ierr = PetscMemzero(seen,bn*sizeof(char));CHKERRQ(ierr);
741 
742   /* Determine ci and cj */
743   for (i=0; i<am; i++) {
744     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
745     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
746     PetscInt packlen = 0,*PETSC_RESTRICT crow;
747     /* Pack segrow */
748     for (j=0; j<anzi; j++) {
749       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
750       for (k=bjstart; k<bjend; k++) {
751         PetscInt bcol = bj[k];
752         if (!seen[bcol]) { /* new entry */
753           PetscInt *PETSC_RESTRICT slot;
754           ierr = PetscSegBufferGetInts(segrow,1,&slot);CHKERRQ(ierr);
755           *slot = bcol;
756           seen[bcol] = 1;
757           packlen++;
758         }
759       }
760     }
761     ierr = PetscSegBufferGetInts(seg,packlen,&crow);CHKERRQ(ierr);
762     ierr = PetscSegBufferExtractTo(segrow,crow);CHKERRQ(ierr);
763     ierr = PetscSortInt(packlen,crow);CHKERRQ(ierr);
764     ci[i+1] = ci[i] + packlen;
765     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
766   }
767   ierr = PetscSegBufferDestroy(&segrow);CHKERRQ(ierr);
768   ierr = PetscFree(seen);CHKERRQ(ierr);
769 
770   /* Column indices are in the segmented buffer */
771   ierr = PetscSegBufferExtractAlloc(seg,&cj);CHKERRQ(ierr);
772   ierr = PetscSegBufferDestroy(&seg);CHKERRQ(ierr);
773 
774   /* put together the new symbolic matrix */
775   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);CHKERRQ(ierr);
776 
777   (*C)->rmap->bs = A->rmap->bs;
778   (*C)->cmap->bs = B->cmap->bs;
779 
780   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
781   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
782   c          = (Mat_SeqAIJ*)((*C)->data);
783   c->free_a  = PETSC_TRUE;
784   c->free_ij = PETSC_TRUE;
785   c->nonew   = 0;
786 
787   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
788 
789   /* set MatInfo */
790   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
791   if (afill < 1.0) afill = 1.0;
792   c->maxnz                     = ci[am];
793   c->nz                        = ci[am];
794   (*C)->info.mallocs           = ndouble;
795   (*C)->info.fill_ratio_given  = fill;
796   (*C)->info.fill_ratio_needed = afill;
797 
798 #if defined(PETSC_USE_INFO)
799   if (ci[am]) {
800     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
801     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
802   } else {
803     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
804   }
805 #endif
806   PetscFunctionReturn(0);
807 }
808 
809 /* This routine is not used. Should be removed! */
810 #undef __FUNCT__
811 #define __FUNCT__ "MatMatTransposeMult_SeqAIJ_SeqAIJ"
812 PetscErrorCode MatMatTransposeMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
813 {
814   PetscErrorCode ierr;
815 
816   PetscFunctionBegin;
817   if (scall == MAT_INITIAL_MATRIX) {
818     ierr = PetscLogEventBegin(MAT_MatTransposeMultSymbolic,A,B,0,0);CHKERRQ(ierr);
819     ierr = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
820     ierr = PetscLogEventEnd(MAT_MatTransposeMultSymbolic,A,B,0,0);CHKERRQ(ierr);
821   }
822   ierr = PetscLogEventBegin(MAT_MatTransposeMultNumeric,A,B,0,0);CHKERRQ(ierr);
823   ierr = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
824   ierr = PetscLogEventEnd(MAT_MatTransposeMultNumeric,A,B,0,0);CHKERRQ(ierr);
825   PetscFunctionReturn(0);
826 }
827 
828 #undef __FUNCT__
829 #define __FUNCT__ "PetscContainerDestroy_Mat_MatMatTransMult"
830 PetscErrorCode PetscContainerDestroy_Mat_MatMatTransMult(void *ptr)
831 {
832   PetscErrorCode      ierr;
833   Mat_MatMatTransMult *multtrans=(Mat_MatMatTransMult*)ptr;
834 
835   PetscFunctionBegin;
836   ierr = MatTransposeColoringDestroy(&multtrans->matcoloring);CHKERRQ(ierr);
837   ierr = MatDestroy(&multtrans->Bt_den);CHKERRQ(ierr);
838   ierr = MatDestroy(&multtrans->ABt_den);CHKERRQ(ierr);
839   ierr = PetscFree(multtrans);CHKERRQ(ierr);
840   PetscFunctionReturn(0);
841 }
842 
843 #undef __FUNCT__
844 #define __FUNCT__ "MatDestroy_SeqAIJ_MatMatMultTrans"
845 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A)
846 {
847   PetscErrorCode      ierr;
848   PetscContainer      container;
849   Mat_MatMatTransMult *multtrans=NULL;
850 
851   PetscFunctionBegin;
852   ierr = PetscObjectQuery((PetscObject)A,"Mat_MatMatTransMult",(PetscObject*)&container);CHKERRQ(ierr);
853   if (!container) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Container does not exit");
854   ierr = PetscContainerGetPointer(container,(void**)&multtrans);CHKERRQ(ierr);
855 
856   A->ops->destroy = multtrans->destroy;
857   if (A->ops->destroy) {
858     ierr = (*A->ops->destroy)(A);CHKERRQ(ierr);
859   }
860   ierr = PetscObjectCompose((PetscObject)A,"Mat_MatMatTransMult",0);CHKERRQ(ierr);
861   PetscFunctionReturn(0);
862 }
863 
864 #undef __FUNCT__
865 #define __FUNCT__ "MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ"
866 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
867 {
868   PetscErrorCode      ierr;
869   Mat                 Bt;
870   PetscInt            *bti,*btj;
871   Mat_MatMatTransMult *multtrans;
872   PetscContainer      container;
873 
874   PetscFunctionBegin;
875   /* create symbolic Bt */
876   ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
877   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);CHKERRQ(ierr);
878 
879   Bt->rmap->bs = A->cmap->bs;
880   Bt->cmap->bs = B->cmap->bs;
881 
882   /* get symbolic C=A*Bt */
883   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr);
884 
885   /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
886   ierr = PetscNew(Mat_MatMatTransMult,&multtrans);CHKERRQ(ierr);
887 
888   /* attach the supporting struct to C */
889   ierr = PetscContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
890   ierr = PetscContainerSetPointer(container,multtrans);CHKERRQ(ierr);
891   ierr = PetscContainerSetUserDestroy(container,PetscContainerDestroy_Mat_MatMatTransMult);CHKERRQ(ierr);
892   ierr = PetscObjectCompose((PetscObject)(*C),"Mat_MatMatTransMult",(PetscObject)container);CHKERRQ(ierr);
893   ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
894 
895   multtrans->usecoloring = PETSC_FALSE;
896   multtrans->destroy     = (*C)->ops->destroy;
897   (*C)->ops->destroy     = MatDestroy_SeqAIJ_MatMatMultTrans;
898 
899   ierr = PetscOptionsGetBool(NULL,"-matmattransmult_color",&multtrans->usecoloring,NULL);CHKERRQ(ierr);
900   if (multtrans->usecoloring) {
901     /* Create MatTransposeColoring from symbolic C=A*B^T */
902     MatTransposeColoring matcoloring;
903     ISColoring           iscoloring;
904     Mat                  Bt_dense,C_dense;
905 
906     ierr = MatGetColoring(*C,MATCOLORINGLF,&iscoloring);CHKERRQ(ierr);
907     ierr = MatTransposeColoringCreate(*C,iscoloring,&matcoloring);CHKERRQ(ierr);
908 
909     multtrans->matcoloring = matcoloring;
910 
911     ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr);
912 
913     /* Create Bt_dense and C_dense = A*Bt_dense */
914     ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr);
915     ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr);
916     ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr);
917     ierr = MatSeqDenseSetPreallocation(Bt_dense,NULL);CHKERRQ(ierr);
918 
919     Bt_dense->assembled = PETSC_TRUE;
920     multtrans->Bt_den   = Bt_dense;
921 
922     ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr);
923     ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr);
924     ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr);
925     ierr = MatSeqDenseSetPreallocation(C_dense,NULL);CHKERRQ(ierr);
926 
927     Bt_dense->assembled = PETSC_TRUE;
928     multtrans->ABt_den  = C_dense;
929 
930 #if defined(PETSC_USE_INFO)
931     {
932       Mat_SeqAIJ *c = (Mat_SeqAIJ*)(*C)->data;
933       ierr = PetscInfo5(*C,"Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",A->cmap->n,matcoloring->ncolors,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));CHKERRQ(ierr);
934     }
935 #endif
936   }
937   /* clean up */
938   ierr = MatDestroy(&Bt);CHKERRQ(ierr);
939   ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
940 
941 
942 
943 #if defined(INEFFICIENT_ALGORITHM)
944   /* The algorithm below computes am*bm sparse inner-product - inefficient! It will be deleted later. */
945   PetscFreeSpaceList free_space=NULL,current_space=NULL;
946   Mat_SeqAIJ         *a        =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
947   PetscInt           *ai       =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ci,*cj,*acol,*bcol;
948   PetscInt           am        =A->rmap->N,bm=B->rmap->N;
949   PetscInt           i,j,anzi,bnzj,cnzi,nlnk,*lnk,nspacedouble=0,ka,kb,index[1];
950   MatScalar          *ca;
951   PetscBT            lnkbt;
952   PetscReal          afill;
953 
954   /* Allocate row pointer array ci  */
955   ierr  = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
956   ci[0] = 0;
957 
958   /* Create and initialize a linked list for C columns */
959   nlnk = bm+1;
960   ierr = PetscLLCreate(bm,bm,nlnk,lnk,lnkbt);CHKERRQ(ierr);
961 
962   /* Initial FreeSpace with size fill*(nnz(A)+nnz(B)) */
963   ierr          = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
964   current_space = free_space;
965 
966   /* Determine symbolic info for each row of the product A*B^T: */
967   for (i=0; i<am; i++) {
968     anzi = ai[i+1] - ai[i];
969     cnzi = 0;
970     acol = aj + ai[i];
971     for (j=0; j<bm; j++) {
972       bnzj = bi[j+1] - bi[j];
973       bcol = bj + bi[j];
974       /* sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
975       ka = 0; kb = 0;
976       while (ka < anzi && kb < bnzj) {
977         while (acol[ka] < bcol[kb] && ka < anzi) ka++;
978         if (ka == anzi) break;
979         while (acol[ka] > bcol[kb] && kb < bnzj) kb++;
980         if (kb == bnzj) break;
981         if (acol[ka] == bcol[kb]) { /* add nonzero c(i,j) to lnk */
982           index[0] = j;
983           ierr     = PetscLLAdd(1,index,bm,nlnk,lnk,lnkbt);CHKERRQ(ierr);
984           cnzi++;
985           break;
986         }
987       }
988     }
989 
990     /* If free space is not available, make more free space */
991     /* Double the amount of total space in the list */
992     if (current_space->local_remaining<cnzi) {
993       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
994       nspacedouble++;
995     }
996 
997     /* Copy data into free space, then initialize lnk */
998     ierr = PetscLLClean(bm,bm,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
999 
1000     current_space->array           += cnzi;
1001     current_space->local_used      += cnzi;
1002     current_space->local_remaining -= cnzi;
1003 
1004     ci[i+1] = ci[i] + cnzi;
1005   }
1006 
1007 
1008   /* Column indices are in the list of free space.
1009      Allocate array cj, copy column indices to cj, and destroy list of free space */
1010   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
1011   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
1012   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1013 
1014   /* Allocate space for ca */
1015   ierr = PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
1016   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
1017 
1018   /* put together the new symbolic matrix */
1019   ierr = MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bm,ci,cj,ca,C);CHKERRQ(ierr);
1020 
1021   (*C)->rmap->bs = A->cmap->bs;
1022   (*C)->cmap->bs = B->cmap->bs;
1023 
1024   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1025   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1026   c          = (Mat_SeqAIJ*)((*C)->data);
1027   c->free_a  = PETSC_TRUE;
1028   c->free_ij = PETSC_TRUE;
1029   c->nonew   = 0;
1030 
1031   /* set MatInfo */
1032   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1033   if (afill < 1.0) afill = 1.0;
1034   c->maxnz                     = ci[am];
1035   c->nz                        = ci[am];
1036   (*C)->info.mallocs           = nspacedouble;
1037   (*C)->info.fill_ratio_given  = fill;
1038   (*C)->info.fill_ratio_needed = afill;
1039 
1040 #if defined(PETSC_USE_INFO)
1041   if (ci[am]) {
1042     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
1043     ierr = PetscInfo1((*C),"Use MatMatTransposeMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
1044   } else {
1045     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
1046   }
1047 #endif
1048 #endif
1049   PetscFunctionReturn(0);
1050 }
1051 
1052 /* #define USE_ARRAY - for sparse dot product. Slower than !USE_ARRAY */
1053 #undef __FUNCT__
1054 #define __FUNCT__ "MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ"
1055 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1056 {
1057   PetscErrorCode      ierr;
1058   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1059   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
1060   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
1061   PetscLogDouble      flops=0.0;
1062   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
1063   Mat_MatMatTransMult *multtrans;
1064   PetscContainer      container;
1065 #if defined(USE_ARRAY)
1066   MatScalar *spdot;
1067 #endif
1068 
1069   PetscFunctionBegin;
1070   /* clear old values in C */
1071   if (!c->a) {
1072     ierr      = PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
1073     c->a      = ca;
1074     c->free_a = PETSC_TRUE;
1075   } else {
1076     ca =  c->a;
1077   }
1078   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1079 
1080   ierr = PetscObjectQuery((PetscObject)C,"Mat_MatMatTransMult",(PetscObject*)&container);CHKERRQ(ierr);
1081   if (!container) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Container does not exit");
1082   ierr = PetscContainerGetPointer(container,(void**)&multtrans);CHKERRQ(ierr);
1083   if (multtrans->usecoloring) {
1084     MatTransposeColoring matcoloring = multtrans->matcoloring;
1085     Mat                  Bt_dense;
1086     PetscInt             m,n;
1087     Mat                  C_dense = multtrans->ABt_den;
1088 
1089     Bt_dense = multtrans->Bt_den;
1090     ierr     = MatGetLocalSize(Bt_dense,&m,&n);CHKERRQ(ierr);
1091 
1092     /* Get Bt_dense by Apply MatTransposeColoring to B */
1093     ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr);
1094 
1095     /* C_dense = A*Bt_dense */
1096     ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr);
1097 
1098     /* Recover C from C_dense */
1099     ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr);
1100     PetscFunctionReturn(0);
1101   }
1102 
1103 #if defined(USE_ARRAY)
1104   /* allocate an array for implementing sparse inner-product */
1105   ierr = PetscMalloc((A->cmap->n+1)*sizeof(MatScalar),&spdot);CHKERRQ(ierr);
1106   ierr = PetscMemzero(spdot,(A->cmap->n+1)*sizeof(MatScalar));CHKERRQ(ierr);
1107 #endif
1108 
1109   for (i=0; i<cm; i++) {
1110     anzi = ai[i+1] - ai[i];
1111     acol = aj + ai[i];
1112     aval = aa + ai[i];
1113     cnzi = ci[i+1] - ci[i];
1114     ccol = cj + ci[i];
1115     cval = ca + ci[i];
1116     for (j=0; j<cnzi; j++) {
1117       brow = ccol[j];
1118       bnzj = bi[brow+1] - bi[brow];
1119       bcol = bj + bi[brow];
1120       bval = ba + bi[brow];
1121 
1122       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
1123 #if defined(USE_ARRAY)
1124       /* put ba to spdot array */
1125       for (nextb=0; nextb<bnzj; nextb++) spdot[bcol[nextb]] = bval[nextb];
1126       /* c(i,j)=A[i,:]*B[j,:]^T */
1127       for (nexta=0; nexta<anzi; nexta++) {
1128         cval[j] += spdot[acol[nexta]]*aval[nexta];
1129       }
1130       /* zero spdot array */
1131       for (nextb=0; nextb<bnzj; nextb++) spdot[bcol[nextb]] = 0.0;
1132 #else
1133       nexta = 0; nextb = 0;
1134       while (nexta<anzi && nextb<bnzj) {
1135         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
1136         if (nexta == anzi) break;
1137         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
1138         if (nextb == bnzj) break;
1139         if (acol[nexta] == bcol[nextb]) {
1140           cval[j] += aval[nexta]*bval[nextb];
1141           nexta++; nextb++;
1142           flops += 2;
1143         }
1144       }
1145 #endif
1146     }
1147   }
1148   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1149   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1150   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1151 #if defined(USE_ARRAY)
1152   ierr = PetscFree(spdot);
1153 #endif
1154   PetscFunctionReturn(0);
1155 }
1156 
1157 #undef __FUNCT__
1158 #define __FUNCT__ "MatTransposeMatMult_SeqAIJ_SeqAIJ"
1159 PetscErrorCode MatTransposeMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
1160 {
1161   PetscErrorCode ierr;
1162 
1163   PetscFunctionBegin;
1164   if (scall == MAT_INITIAL_MATRIX) {
1165     ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
1166   }
1167   ierr = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
1168   PetscFunctionReturn(0);
1169 }
1170 
1171 #undef __FUNCT__
1172 #define __FUNCT__ "MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ"
1173 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
1174 {
1175   PetscErrorCode ierr;
1176   Mat            At;
1177   PetscInt       *ati,*atj;
1178 
1179   PetscFunctionBegin;
1180   /* create symbolic At */
1181   ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
1182   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);CHKERRQ(ierr);
1183 
1184   At->rmap->bs = A->cmap->bs;
1185   At->cmap->bs = B->cmap->bs;
1186 
1187   /* get symbolic C=At*B */
1188   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
1189 
1190   /* clean up */
1191   ierr = MatDestroy(&At);CHKERRQ(ierr);
1192   ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
1193   PetscFunctionReturn(0);
1194 }
1195 
1196 #undef __FUNCT__
1197 #define __FUNCT__ "MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ"
1198 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1199 {
1200   PetscErrorCode ierr;
1201   Mat_SeqAIJ     *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1202   PetscInt       am   =A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1203   PetscInt       cm   =C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1204   PetscLogDouble flops=0.0;
1205   MatScalar      *aa  =a->a,*ba,*ca,*caj;
1206 
1207   PetscFunctionBegin;
1208   if (!c->a) {
1209     ierr = PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
1210 
1211     c->a      = ca;
1212     c->free_a = PETSC_TRUE;
1213   } else {
1214     ca = c->a;
1215   }
1216   /* clear old values in C */
1217   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1218 
1219   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1220   for (i=0; i<am; i++) {
1221     bj   = b->j + bi[i];
1222     ba   = b->a + bi[i];
1223     bnzi = bi[i+1] - bi[i];
1224     anzi = ai[i+1] - ai[i];
1225     for (j=0; j<anzi; j++) {
1226       nextb = 0;
1227       crow  = *aj++;
1228       cjj   = cj + ci[crow];
1229       caj   = ca + ci[crow];
1230       /* perform sparse axpy operation.  Note cjj includes bj. */
1231       for (k=0; nextb<bnzi; k++) {
1232         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
1233           caj[k] += (*aa)*(*(ba+nextb));
1234           nextb++;
1235         }
1236       }
1237       flops += 2*bnzi;
1238       aa++;
1239     }
1240   }
1241 
1242   /* Assemble the final matrix and clean up */
1243   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1244   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1245   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1246   PetscFunctionReturn(0);
1247 }
1248 
1249 #undef __FUNCT__
1250 #define __FUNCT__ "MatMatMult_SeqAIJ_SeqDense"
1251 PetscErrorCode MatMatMult_SeqAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
1252 {
1253   PetscErrorCode ierr;
1254 
1255   PetscFunctionBegin;
1256   if (scall == MAT_INITIAL_MATRIX) {
1257     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
1258     ierr = MatMatMultSymbolic_SeqAIJ_SeqDense(A,B,fill,C);CHKERRQ(ierr);
1259     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
1260   }
1261   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
1262   ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,B,*C);CHKERRQ(ierr);
1263   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
1264   PetscFunctionReturn(0);
1265 }
1266 
1267 #undef __FUNCT__
1268 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqDense"
1269 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
1270 {
1271   PetscErrorCode ierr;
1272 
1273   PetscFunctionBegin;
1274   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
1275 
1276   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1277   PetscFunctionReturn(0);
1278 }
1279 
1280 #undef __FUNCT__
1281 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqDense"
1282 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1283 {
1284   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
1285   PetscErrorCode ierr;
1286   PetscScalar    *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
1287   MatScalar      *aa;
1288   PetscInt       cm  = C->rmap->n, cn=B->cmap->n, bm=B->rmap->n, col, i,j,n,*aj, am = A->rmap->n;
1289   PetscInt       am2 = 2*am, am3 = 3*am,  bm4 = 4*bm,colam;
1290 
1291   PetscFunctionBegin;
1292   if (!cm || !cn) PetscFunctionReturn(0);
1293   if (bm != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,bm);
1294   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1295   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1296   ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr);
1297   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1298   b1   = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1299   for (col=0; col<cn-4; col += 4) {  /* over columns of C */
1300     colam = col*am;
1301     for (i=0; i<am; i++) {        /* over rows of C in those columns */
1302       r1 = r2 = r3 = r4 = 0.0;
1303       n  = a->i[i+1] - a->i[i];
1304       aj = a->j + a->i[i];
1305       aa = a->a + a->i[i];
1306       for (j=0; j<n; j++) {
1307         r1 += (*aa)*b1[*aj];
1308         r2 += (*aa)*b2[*aj];
1309         r3 += (*aa)*b3[*aj];
1310         r4 += (*aa++)*b4[*aj++];
1311       }
1312       c[colam + i]       = r1;
1313       c[colam + am + i]  = r2;
1314       c[colam + am2 + i] = r3;
1315       c[colam + am3 + i] = r4;
1316     }
1317     b1 += bm4;
1318     b2 += bm4;
1319     b3 += bm4;
1320     b4 += bm4;
1321   }
1322   for (; col<cn; col++) {     /* over extra columns of C */
1323     for (i=0; i<am; i++) {  /* over rows of C in those columns */
1324       r1 = 0.0;
1325       n  = a->i[i+1] - a->i[i];
1326       aj = a->j + a->i[i];
1327       aa = a->a + a->i[i];
1328 
1329       for (j=0; j<n; j++) {
1330         r1 += (*aa++)*b1[*aj++];
1331       }
1332       c[col*am + i] = r1;
1333     }
1334     b1 += bm;
1335   }
1336   ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr);
1337   ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr);
1338   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1339   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1340   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1341   PetscFunctionReturn(0);
1342 }
1343 
1344 /*
1345    Note very similar to MatMult_SeqAIJ(), should generate both codes from same base
1346 */
1347 #undef __FUNCT__
1348 #define __FUNCT__ "MatMatMultNumericAdd_SeqAIJ_SeqDense"
1349 PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1350 {
1351   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
1352   PetscErrorCode ierr;
1353   PetscScalar    *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
1354   MatScalar      *aa;
1355   PetscInt       cm  = C->rmap->n, cn=B->cmap->n, bm=B->rmap->n, col, i,j,n,*aj, am = A->rmap->n,*ii,arm;
1356   PetscInt       am2 = 2*am, am3 = 3*am,  bm4 = 4*bm,colam,*ridx;
1357 
1358   PetscFunctionBegin;
1359   if (!cm || !cn) PetscFunctionReturn(0);
1360   ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr);
1361   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1362   b1   = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1363 
1364   if (a->compressedrow.use) { /* use compressed row format */
1365     for (col=0; col<cn-4; col += 4) {  /* over columns of C */
1366       colam = col*am;
1367       arm   = a->compressedrow.nrows;
1368       ii    = a->compressedrow.i;
1369       ridx  = a->compressedrow.rindex;
1370       for (i=0; i<arm; i++) {        /* over rows of C in those columns */
1371         r1 = r2 = r3 = r4 = 0.0;
1372         n  = ii[i+1] - ii[i];
1373         aj = a->j + ii[i];
1374         aa = a->a + ii[i];
1375         for (j=0; j<n; j++) {
1376           r1 += (*aa)*b1[*aj];
1377           r2 += (*aa)*b2[*aj];
1378           r3 += (*aa)*b3[*aj];
1379           r4 += (*aa++)*b4[*aj++];
1380         }
1381         c[colam       + ridx[i]] += r1;
1382         c[colam + am  + ridx[i]] += r2;
1383         c[colam + am2 + ridx[i]] += r3;
1384         c[colam + am3 + ridx[i]] += r4;
1385       }
1386       b1 += bm4;
1387       b2 += bm4;
1388       b3 += bm4;
1389       b4 += bm4;
1390     }
1391     for (; col<cn; col++) {     /* over extra columns of C */
1392       colam = col*am;
1393       arm   = a->compressedrow.nrows;
1394       ii    = a->compressedrow.i;
1395       ridx  = a->compressedrow.rindex;
1396       for (i=0; i<arm; i++) {  /* over rows of C in those columns */
1397         r1 = 0.0;
1398         n  = ii[i+1] - ii[i];
1399         aj = a->j + ii[i];
1400         aa = a->a + ii[i];
1401 
1402         for (j=0; j<n; j++) {
1403           r1 += (*aa++)*b1[*aj++];
1404         }
1405         c[colam + ridx[i]] += r1;
1406       }
1407       b1 += bm;
1408     }
1409   } else {
1410     for (col=0; col<cn-4; col += 4) {  /* over columns of C */
1411       colam = col*am;
1412       for (i=0; i<am; i++) {        /* over rows of C in those columns */
1413         r1 = r2 = r3 = r4 = 0.0;
1414         n  = a->i[i+1] - a->i[i];
1415         aj = a->j + a->i[i];
1416         aa = a->a + a->i[i];
1417         for (j=0; j<n; j++) {
1418           r1 += (*aa)*b1[*aj];
1419           r2 += (*aa)*b2[*aj];
1420           r3 += (*aa)*b3[*aj];
1421           r4 += (*aa++)*b4[*aj++];
1422         }
1423         c[colam + i]       += r1;
1424         c[colam + am + i]  += r2;
1425         c[colam + am2 + i] += r3;
1426         c[colam + am3 + i] += r4;
1427       }
1428       b1 += bm4;
1429       b2 += bm4;
1430       b3 += bm4;
1431       b4 += bm4;
1432     }
1433     for (; col<cn; col++) {     /* over extra columns of C */
1434       colam = col*am;
1435       for (i=0; i<am; i++) {  /* over rows of C in those columns */
1436         r1 = 0.0;
1437         n  = a->i[i+1] - a->i[i];
1438         aj = a->j + a->i[i];
1439         aa = a->a + a->i[i];
1440 
1441         for (j=0; j<n; j++) {
1442           r1 += (*aa++)*b1[*aj++];
1443         }
1444         c[colam + i] += r1;
1445       }
1446       b1 += bm;
1447     }
1448   }
1449   ierr = PetscLogFlops(cn*2.0*a->nz);CHKERRQ(ierr);
1450   ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr);
1451   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1452   PetscFunctionReturn(0);
1453 }
1454 
1455 #undef __FUNCT__
1456 #define __FUNCT__ "MatTransColoringApplySpToDen_SeqAIJ"
1457 PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1458 {
1459   PetscErrorCode ierr;
1460   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
1461   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
1462   PetscInt       *bi      = b->i,*bj=b->j;
1463   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1464   MatScalar      *btval,*btval_den,*ba=b->a;
1465   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1466 
1467   PetscFunctionBegin;
1468   btval_den=btdense->v;
1469   ierr     = PetscMemzero(btval_den,(m*n)*sizeof(MatScalar));CHKERRQ(ierr);
1470   for (k=0; k<ncolors; k++) {
1471     ncolumns = coloring->ncolumns[k];
1472     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1473       col   = *(columns + colorforcol[k] + l);
1474       btcol = bj + bi[col];
1475       btval = ba + bi[col];
1476       anz   = bi[col+1] - bi[col];
1477       for (j=0; j<anz; j++) {
1478         brow            = btcol[j];
1479         btval_den[brow] = btval[j];
1480       }
1481     }
1482     btval_den += m;
1483   }
1484   PetscFunctionReturn(0);
1485 }
1486 
1487 #undef __FUNCT__
1488 #define __FUNCT__ "MatTransColoringApplyDenToSp_SeqAIJ"
1489 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1490 {
1491   PetscErrorCode ierr;
1492   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)Csp->data;
1493   PetscInt       k,l,*row,*idx,m,ncolors=matcoloring->ncolors,nrows;
1494   PetscScalar    *ca_den,*cp_den,*ca=csp->a;
1495   PetscInt       *rows=matcoloring->rows,*spidx=matcoloring->columnsforspidx,*colorforrow=matcoloring->colorforrow;
1496 
1497   PetscFunctionBegin;
1498   ierr   = MatGetLocalSize(Csp,&m,NULL);CHKERRQ(ierr);
1499   ierr   = MatDenseGetArray(Cden,&ca_den);CHKERRQ(ierr);
1500   cp_den = ca_den;
1501   for (k=0; k<ncolors; k++) {
1502     nrows = matcoloring->nrows[k];
1503     row   = rows  + colorforrow[k];
1504     idx   = spidx + colorforrow[k];
1505     for (l=0; l<nrows; l++) {
1506       ca[idx[l]] = cp_den[row[l]];
1507     }
1508     cp_den += m;
1509   }
1510   ierr = MatDenseRestoreArray(Cden,&ca_den);CHKERRQ(ierr);
1511   PetscFunctionReturn(0);
1512 }
1513 
1514 /*
1515  MatGetColumnIJ_SeqAIJ_Color() and MatRestoreColumnIJ_SeqAIJ_Color() are customized from
1516  MatGetColumnIJ_SeqAIJ() and MatRestoreColumnIJ_SeqAIJ() by adding an output
1517  spidx[], index of a->j, to be used for setting 'columnsforspidx' in MatTransposeColoringCreate_SeqAIJ().
1518  */
1519 #undef __FUNCT__
1520 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Color"
1521 PetscErrorCode MatGetColumnIJ_SeqAIJ_Color(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool inodecompressed,PetscInt *nn,const PetscInt *ia[],const PetscInt *ja[],PetscInt *spidx[],PetscBool  *done)
1522 {
1523   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
1524   PetscErrorCode ierr;
1525   PetscInt       i,*collengths,*cia,*cja,n = A->cmap->n,m = A->rmap->n;
1526   PetscInt       nz = a->i[m],row,*jj,mr,col;
1527   PetscInt       *cspidx;
1528 
1529   PetscFunctionBegin;
1530   *nn = n;
1531   if (!ia) PetscFunctionReturn(0);
1532   if (symmetric) {
1533     SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatGetColumnIJ_SeqAIJ_Color() not supported for the case symmetric");
1534     ierr = MatToSymmetricIJ_SeqAIJ(A->rmap->n,a->i,a->j,0,oshift,(PetscInt**)ia,(PetscInt**)ja);CHKERRQ(ierr);
1535   } else {
1536     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&collengths);CHKERRQ(ierr);
1537     ierr = PetscMemzero(collengths,n*sizeof(PetscInt));CHKERRQ(ierr);
1538     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&cia);CHKERRQ(ierr);
1539     ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&cja);CHKERRQ(ierr);
1540     ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&cspidx);CHKERRQ(ierr);
1541     jj   = a->j;
1542     for (i=0; i<nz; i++) {
1543       collengths[jj[i]]++;
1544     }
1545     cia[0] = oshift;
1546     for (i=0; i<n; i++) {
1547       cia[i+1] = cia[i] + collengths[i];
1548     }
1549     ierr = PetscMemzero(collengths,n*sizeof(PetscInt));CHKERRQ(ierr);
1550     jj   = a->j;
1551     for (row=0; row<m; row++) {
1552       mr = a->i[row+1] - a->i[row];
1553       for (i=0; i<mr; i++) {
1554         col = *jj++;
1555 
1556         cspidx[cia[col] + collengths[col] - oshift] = a->i[row] + i; /* index of a->j */
1557         cja[cia[col] + collengths[col]++ - oshift]  = row + oshift;
1558       }
1559     }
1560     ierr   = PetscFree(collengths);CHKERRQ(ierr);
1561     *ia    = cia; *ja = cja;
1562     *spidx = cspidx;
1563   }
1564   PetscFunctionReturn(0);
1565 }
1566 
1567 #undef __FUNCT__
1568 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Color"
1569 PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Color(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool inodecompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscInt *spidx[],PetscBool  *done)
1570 {
1571   PetscErrorCode ierr;
1572 
1573   PetscFunctionBegin;
1574   if (!ia) PetscFunctionReturn(0);
1575 
1576   ierr = PetscFree(*ia);CHKERRQ(ierr);
1577   ierr = PetscFree(*ja);CHKERRQ(ierr);
1578   ierr = PetscFree(*spidx);CHKERRQ(ierr);
1579   PetscFunctionReturn(0);
1580 }
1581 
1582 #undef __FUNCT__
1583 #define __FUNCT__ "MatTransposeColoringCreate_SeqAIJ"
1584 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1585 {
1586   PetscErrorCode ierr;
1587   PetscInt       i,n,nrows,N,j,k,m,ncols,col,cm;
1588   const PetscInt *is,*ci,*cj,*row_idx;
1589   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1590   IS             *isa;
1591   PetscBool      flg1,flg2;
1592   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1593   PetscInt       *colorforrow,*rows,*rows_i,*columnsforspidx,*columnsforspidx_i,*idxhit,*spidx;
1594   PetscInt       *colorforcol,*columns,*columns_i;
1595 
1596   PetscFunctionBegin;
1597   ierr = ISColoringGetIS(iscoloring,PETSC_IGNORE,&isa);CHKERRQ(ierr);
1598 
1599   /* this is ugly way to get blocksize but cannot call MatGetBlockSize() because AIJ can have bs > 1 */
1600   ierr = PetscObjectTypeCompare((PetscObject)mat,MATSEQBAIJ,&flg1);CHKERRQ(ierr);
1601   ierr = PetscObjectTypeCompare((PetscObject)mat,MATMPIBAIJ,&flg2);CHKERRQ(ierr);
1602   if (flg1 || flg2) {
1603     ierr = MatGetBlockSize(mat,&bs);CHKERRQ(ierr);
1604   }
1605 
1606   N         = mat->cmap->N/bs;
1607   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1608   c->N      = mat->cmap->N/bs;
1609   c->m      = mat->rmap->N/bs;
1610   c->rstart = 0;
1611 
1612   c->ncolors = nis;
1613   ierr       = PetscMalloc(nis*sizeof(PetscInt),&c->ncolumns);CHKERRQ(ierr);
1614   ierr       = PetscMalloc(nis*sizeof(PetscInt),&c->nrows);CHKERRQ(ierr);
1615   ierr       = PetscMalloc2(csp->nz+1,PetscInt,&rows,csp->nz+1,PetscInt,&columnsforspidx);CHKERRQ(ierr);
1616   ierr       = PetscMalloc((nis+1)*sizeof(PetscInt),&colorforrow);CHKERRQ(ierr);
1617 
1618   colorforrow[0]    = 0;
1619   rows_i            = rows;
1620   columnsforspidx_i = columnsforspidx;
1621 
1622   ierr = PetscMalloc((nis+1)*sizeof(PetscInt),&colorforcol);CHKERRQ(ierr);
1623   ierr = PetscMalloc((N+1)*sizeof(PetscInt),&columns);CHKERRQ(ierr);
1624 
1625   colorforcol[0] = 0;
1626   columns_i      = columns;
1627 
1628   ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr); /* column-wise storage of mat */
1629 
1630   cm   = c->m;
1631   ierr = PetscMalloc((cm+1)*sizeof(PetscInt),&rowhit);CHKERRQ(ierr);
1632   ierr = PetscMalloc((cm+1)*sizeof(PetscInt),&idxhit);CHKERRQ(ierr);
1633   for (i=0; i<nis; i++) {
1634     ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr);
1635     ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr);
1636 
1637     c->ncolumns[i] = n;
1638     if (n) {
1639       ierr = PetscMemcpy(columns_i,is,n*sizeof(PetscInt));CHKERRQ(ierr);
1640     }
1641     colorforcol[i+1] = colorforcol[i] + n;
1642     columns_i       += n;
1643 
1644     /* fast, crude version requires O(N*N) work */
1645     ierr = PetscMemzero(rowhit,cm*sizeof(PetscInt));CHKERRQ(ierr);
1646 
1647     /* loop over columns*/
1648     for (j=0; j<n; j++) {
1649       col     = is[j];
1650       row_idx = cj + ci[col];
1651       m       = ci[col+1] - ci[col];
1652       /* loop over columns marking them in rowhit */
1653       for (k=0; k<m; k++) {
1654         idxhit[*row_idx]   = spidx[ci[col] + k];
1655         rowhit[*row_idx++] = col + 1;
1656       }
1657     }
1658     /* count the number of hits */
1659     nrows = 0;
1660     for (j=0; j<cm; j++) {
1661       if (rowhit[j]) nrows++;
1662     }
1663     c->nrows[i]      = nrows;
1664     colorforrow[i+1] = colorforrow[i] + nrows;
1665 
1666     nrows = 0;
1667     for (j=0; j<cm; j++) {
1668       if (rowhit[j]) {
1669         rows_i[nrows]            = j;
1670         columnsforspidx_i[nrows] = idxhit[j];
1671         nrows++;
1672       }
1673     }
1674     ierr    = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr);
1675     rows_i += nrows; columnsforspidx_i += nrows;
1676   }
1677   ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);CHKERRQ(ierr);
1678   ierr = PetscFree(rowhit);CHKERRQ(ierr);
1679   ierr = ISColoringRestoreIS(iscoloring,&isa);CHKERRQ(ierr);
1680 #if defined(PETSC_USE_DEBUG)
1681   if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1682 #endif
1683 
1684   c->colorforrow     = colorforrow;
1685   c->rows            = rows;
1686   c->columnsforspidx = columnsforspidx;
1687   c->colorforcol     = colorforcol;
1688   c->columns         = columns;
1689 
1690   ierr = PetscFree(idxhit);CHKERRQ(ierr);
1691   PetscFunctionReturn(0);
1692 }
1693