xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision bf388a1f1fb7cbb324cb4eba80ecc0a331d4a847)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4           C = A * B
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <../src/mat/utils/petscheap.h>
10 #include <petscbt.h>
11 #include <../src/mat/impls/dense/seq/dense.h>
12 
13 #undef __FUNCT__
14 #define __FUNCT__ "MatMatMult_SeqAIJ_SeqAIJ"
15 PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
16 {
17   PetscErrorCode ierr;
18   PetscBool      scalable=PETSC_FALSE,scalable_fast=PETSC_FALSE,heap = PETSC_FALSE,btheap = PETSC_FALSE;
19 
20   PetscFunctionBegin;
21   if (scall == MAT_INITIAL_MATRIX){
22     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
23     ierr = PetscOptionsBool("-matmatmult_scalable","Use a scalable but slower C=A*B","",scalable,&scalable,PETSC_NULL);CHKERRQ(ierr);
24     ierr = PetscOptionsBool("-matmatmult_scalable_fast","Use a scalable but slower C=A*B","",scalable_fast,&scalable_fast,PETSC_NULL);CHKERRQ(ierr);
25     ierr = PetscOptionsBool("-matmatmult_heap","Use heap implementation of symbolic factorization C=A*B","",heap,&heap,PETSC_NULL);CHKERRQ(ierr);
26     ierr = PetscOptionsBool("-matmatmult_btheap","Use btheap implementation of symbolic factorization C=A*B","",btheap,&btheap,PETSC_NULL);CHKERRQ(ierr);
27     ierr = PetscOptionsEnd();CHKERRQ(ierr);
28     if (scalable_fast){
29       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr);
30     } else if (scalable){
31       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr);
32     } else if (heap) {
33       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr);
34     } else if (btheap) {
35       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr);
36     } else {
37       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
38     }
39   }
40 
41   ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
42   PetscFunctionReturn(0);
43 }
44 
45 #undef __FUNCT__
46 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ"
47 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
48 {
49   PetscErrorCode     ierr;
50   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
51   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
52   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
53   PetscReal          afill;
54   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
55   PetscBT            lnkbt;
56   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
57 
58   PetscFunctionBegin;
59   /* Get ci and cj */
60   /*---------------*/
61   /* Allocate ci array, arrays for fill computation and */
62   /* free space for accumulating nonzero column info */
63   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
64   ci[0] = 0;
65 
66   /* create and initialize a linked list */
67   nlnk_max = a->rmax*b->rmax;
68   if (!nlnk_max || nlnk_max > bn) nlnk_max = bn;
69   ierr = PetscLLCondensedCreate(nlnk_max,bn,&lnk,&lnkbt);CHKERRQ(ierr);
70 
71   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
72   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
73   current_space = free_space;
74 
75   /* Determine ci and cj */
76   for (i=0; i<am; i++) {
77     anzi = ai[i+1] - ai[i];
78     aj   = a->j + ai[i];
79     for (j=0; j<anzi; j++){
80       brow = aj[j];
81       bnzj = bi[brow+1] - bi[brow];
82       bj   = b->j + bi[brow];
83       /* add non-zero cols of B into the sorted linked list lnk */
84       ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr);
85     }
86     cnzi = lnk[0];
87 
88     /* If free space is not available, make more free space */
89     /* Double the amount of total space in the list */
90     if (current_space->local_remaining<cnzi) {
91       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
92       ndouble++;
93     }
94 
95     /* Copy data into free space, then initialize lnk */
96     ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
97     current_space->array           += cnzi;
98     current_space->local_used      += cnzi;
99     current_space->local_remaining -= cnzi;
100     ci[i+1] = ci[i] + cnzi;
101   }
102 
103   /* Column indices are in the list of free space */
104   /* Allocate space for cj, initialize cj, and */
105   /* destroy list of free space and other temporary array(s) */
106   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
107   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
108   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
109 
110   /* put together the new symbolic matrix */
111   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,PETSC_NULL,C);CHKERRQ(ierr);
112   (*C)->rmap->bs = A->rmap->bs;
113   (*C)->cmap->bs = B->cmap->bs;
114 
115   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
116   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
117   c = (Mat_SeqAIJ *)((*C)->data);
118   c->free_a  = PETSC_FALSE;
119   c->free_ij = PETSC_TRUE;
120   c->nonew   = 0;
121   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; /* fast, needs non-scalable O(bn) array 'abdense' */
122 
123   /* set MatInfo */
124   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
125   if (afill < 1.0) afill = 1.0;
126   c->maxnz                     = ci[am];
127   c->nz                        = ci[am];
128   (*C)->info.mallocs           = ndouble;
129   (*C)->info.fill_ratio_given  = fill;
130   (*C)->info.fill_ratio_needed = afill;
131 
132 #if defined(PETSC_USE_INFO)
133   if (ci[am]) {
134     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
135     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
136   } else {
137     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
138   }
139 #endif
140   PetscFunctionReturn(0);
141 }
142 
143 #undef __FUNCT__
144 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqAIJ"
145 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
146 {
147   PetscErrorCode ierr;
148   PetscLogDouble flops=0.0;
149   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
150   Mat_SeqAIJ     *b = (Mat_SeqAIJ *)B->data;
151   Mat_SeqAIJ     *c = (Mat_SeqAIJ *)C->data;
152   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
153   PetscInt       am=A->rmap->n,cm=C->rmap->n;
154   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
155   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca,valtmp;
156   PetscScalar    *ab_dense;
157 
158   PetscFunctionBegin;
159   /* printf("MatMatMultNumeric_SeqAIJ_SeqAIJ...ca %p\n",c->a); */
160   if (!c->a){ /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
161     ierr = PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
162     c->a      = ca;
163     c->free_a = PETSC_TRUE;
164 
165     ierr = PetscMalloc(B->cmap->N*sizeof(PetscScalar),&ab_dense);CHKERRQ(ierr);
166     ierr = PetscMemzero(ab_dense,B->cmap->N*sizeof(PetscScalar));CHKERRQ(ierr);
167     c->matmult_abdense = ab_dense;
168   } else {
169     ca       = c->a;
170     ab_dense = c->matmult_abdense;
171   }
172 
173   /* clean old values in C */
174   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
175   /* Traverse A row-wise. */
176   /* Build the ith row in C by summing over nonzero columns in A, */
177   /* the rows of B corresponding to nonzeros of A. */
178   for (i=0; i<am; i++) {
179     anzi = ai[i+1] - ai[i];
180     for (j=0; j<anzi; j++) {
181       brow = aj[j];
182       bnzi = bi[brow+1] - bi[brow];
183       bjj  = bj + bi[brow];
184       baj  = ba + bi[brow];
185       /* perform dense axpy */
186       valtmp = aa[j];
187       for (k=0; k<bnzi; k++) {
188         ab_dense[bjj[k]] += valtmp*baj[k];
189       }
190       flops += 2*bnzi;
191     }
192     aj += anzi; aa += anzi;
193 
194     cnzi = ci[i+1] - ci[i];
195     for (k=0; k<cnzi; k++) {
196       ca[k]          += ab_dense[cj[k]];
197       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
198     }
199     flops += cnzi;
200     cj += cnzi; ca += cnzi;
201   }
202   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
203   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
204   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
205   PetscFunctionReturn(0);
206 }
207 
208 #undef __FUNCT__
209 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable"
210 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
211 {
212   PetscErrorCode ierr;
213   PetscLogDouble flops=0.0;
214   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
215   Mat_SeqAIJ     *b = (Mat_SeqAIJ *)B->data;
216   Mat_SeqAIJ     *c = (Mat_SeqAIJ *)C->data;
217   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
218   PetscInt       am=A->rmap->N,cm=C->rmap->N;
219   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
220   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
221   PetscInt       nextb;
222 
223   PetscFunctionBegin;
224   /* clean old values in C */
225   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
226   /* Traverse A row-wise. */
227   /* Build the ith row in C by summing over nonzero columns in A, */
228   /* the rows of B corresponding to nonzeros of A. */
229   for (i=0;i<am;i++) {
230     anzi = ai[i+1] - ai[i];
231     cnzi = ci[i+1] - ci[i];
232     for (j=0;j<anzi;j++) {
233       brow = aj[j];
234       bnzi = bi[brow+1] - bi[brow];
235       bjj  = bj + bi[brow];
236       baj  = ba + bi[brow];
237       /* perform sparse axpy */
238       valtmp = aa[j];
239       nextb  = 0;
240       for (k=0; nextb<bnzi; k++) {
241         if (cj[k] == bjj[nextb]){ /* ccol == bcol */
242           ca[k] += valtmp*baj[nextb++];
243         }
244       }
245       flops += 2*bnzi;
246     }
247     aj += anzi; aa += anzi;
248     cj += cnzi; ca += cnzi;
249   }
250 
251   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
252   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
253   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
254   PetscFunctionReturn(0);
255 }
256 
257 #undef __FUNCT__
258 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast"
259 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat *C)
260 {
261   PetscErrorCode     ierr;
262   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
263   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
264   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
265   MatScalar          *ca;
266   PetscReal          afill;
267   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
268   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
269 
270   PetscFunctionBegin;
271   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
272   /*-----------------------------------------------------------------------------------------*/
273   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
274   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
275   ci[0] = 0;
276 
277   /* create and initialize a linked list */
278   nlnk_max = a->rmax*b->rmax;
279   if (!nlnk_max || nlnk_max > bn) nlnk_max = bn; /* in case rmax is not defined for A or B */
280   ierr = PetscLLCondensedCreate_fast(nlnk_max,&lnk);CHKERRQ(ierr);
281 
282   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
283   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
284   current_space = free_space;
285 
286   /* Determine ci and cj */
287   for (i=0; i<am; i++) {
288     anzi = ai[i+1] - ai[i];
289     aj   = a->j + ai[i];
290     for (j=0; j<anzi; j++){
291       brow = aj[j];
292       bnzj = bi[brow+1] - bi[brow];
293       bj   = b->j + bi[brow];
294       /* add non-zero cols of B into the sorted linked list lnk */
295       ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr);
296     }
297     cnzi = lnk[1];
298 
299     /* If free space is not available, make more free space */
300     /* Double the amount of total space in the list */
301     if (current_space->local_remaining<cnzi) {
302       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
303       ndouble++;
304     }
305 
306     /* Copy data into free space, then initialize lnk */
307     ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr);
308     current_space->array           += cnzi;
309     current_space->local_used      += cnzi;
310     current_space->local_remaining -= cnzi;
311     ci[i+1] = ci[i] + cnzi;
312   }
313 
314   /* Column indices are in the list of free space */
315   /* Allocate space for cj, initialize cj, and */
316   /* destroy list of free space and other temporary array(s) */
317   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
318   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
319   ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr);
320 
321   /* Allocate space for ca */
322   ierr = PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
323   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
324 
325   /* put together the new symbolic matrix */
326   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,ca,C);CHKERRQ(ierr);
327   (*C)->rmap->bs = A->rmap->bs;
328   (*C)->cmap->bs = B->cmap->bs;
329 
330   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
331   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
332   c = (Mat_SeqAIJ *)((*C)->data);
333   c->free_a   = PETSC_TRUE;
334   c->free_ij  = PETSC_TRUE;
335   c->nonew    = 0;
336   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
337 
338   /* set MatInfo */
339   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
340   if (afill < 1.0) afill = 1.0;
341   c->maxnz                     = ci[am];
342   c->nz                        = ci[am];
343   (*C)->info.mallocs           = ndouble;
344   (*C)->info.fill_ratio_given  = fill;
345   (*C)->info.fill_ratio_needed = afill;
346 
347 #if defined(PETSC_USE_INFO)
348   if (ci[am]) {
349     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
350     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
351   } else {
352     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
353   }
354 #endif
355   PetscFunctionReturn(0);
356 }
357 
358 
359 #undef __FUNCT__
360 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable"
361 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat *C)
362 {
363   PetscErrorCode     ierr;
364   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
365   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
366   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
367   MatScalar          *ca;
368   PetscReal          afill;
369   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
370   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
371 
372   PetscFunctionBegin;
373   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
374   /*---------------------------------------------------------------------------------------------*/
375   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
376   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
377   ci[0] = 0;
378 
379   /* create and initialize a linked list */
380   nlnk_max = a->rmax*b->rmax;
381   if (!nlnk_max || nlnk_max > bn) nlnk_max = bn; /* in case rmax is not defined for A or B */
382   ierr = PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);CHKERRQ(ierr);
383 
384   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
385   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
386   current_space = free_space;
387 
388   /* Determine ci and cj */
389   for (i=0; i<am; i++) {
390     anzi = ai[i+1] - ai[i];
391     aj   = a->j + ai[i];
392     for (j=0; j<anzi; j++){
393       brow = aj[j];
394       bnzj = bi[brow+1] - bi[brow];
395       bj   = b->j + bi[brow];
396       /* add non-zero cols of B into the sorted linked list lnk */
397       ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr);
398     }
399     cnzi = lnk[0];
400 
401     /* If free space is not available, make more free space */
402     /* Double the amount of total space in the list */
403     if (current_space->local_remaining<cnzi) {
404       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
405       ndouble++;
406     }
407 
408     /* Copy data into free space, then initialize lnk */
409     ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr);
410     current_space->array           += cnzi;
411     current_space->local_used      += cnzi;
412     current_space->local_remaining -= cnzi;
413     ci[i+1] = ci[i] + cnzi;
414   }
415 
416   /* Column indices are in the list of free space */
417   /* Allocate space for cj, initialize cj, and */
418   /* destroy list of free space and other temporary array(s) */
419   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
420   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
421   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
422 
423   /* Allocate space for ca */
424   /*-----------------------*/
425   ierr = PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
426   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
427 
428   /* put together the new symbolic matrix */
429   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,ca,C);CHKERRQ(ierr);
430   (*C)->rmap->bs = A->rmap->bs;
431   (*C)->cmap->bs = B->cmap->bs;
432 
433   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
434   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
435   c = (Mat_SeqAIJ *)((*C)->data);
436   c->free_a   = PETSC_TRUE;
437   c->free_ij  = PETSC_TRUE;
438   c->nonew    = 0;
439   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
440 
441   /* set MatInfo */
442   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
443   if (afill < 1.0) afill = 1.0;
444   c->maxnz                     = ci[am];
445   c->nz                        = ci[am];
446   (*C)->info.mallocs           = ndouble;
447   (*C)->info.fill_ratio_given  = fill;
448   (*C)->info.fill_ratio_needed = afill;
449 
450 #if defined(PETSC_USE_INFO)
451   if (ci[am]) {
452     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
453     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
454   } else {
455     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
456   }
457 #endif
458   PetscFunctionReturn(0);
459 }
460 
461 #undef __FUNCT__
462 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap"
463 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat *C)
464 {
465   PetscErrorCode     ierr;
466   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
467   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
468   PetscInt           *ci,*cj,*bb;
469   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
470   PetscReal          afill;
471   PetscInt           i,j,col,ndouble = 0;
472   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
473   PetscHeap          h;
474 
475   PetscFunctionBegin;
476   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
477   /*---------------------------------------------------------------------------------------------*/
478   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
479   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
480   ci[0] = 0;
481 
482   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
483   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
484   current_space = free_space;
485 
486   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
487   ierr = PetscMalloc(a->rmax*sizeof(PetscInt),&bb);CHKERRQ(ierr);
488 
489   /* Determine ci and cj */
490   for (i=0; i<am; i++) {
491     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
492     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
493     ci[i+1] = ci[i];
494     /* Populate the min heap */
495     for (j=0; j<anzi; j++) {
496       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
497       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
498         ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr);
499       }
500     }
501     /* Pick off the min element, adding it to free space */
502     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
503     while (j >= 0) {
504       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
505         ierr = PetscFreeSpaceGet(PetscMin(2*current_space->total_array_size,16 << 20),&current_space);CHKERRQ(ierr);
506         ndouble++;
507       }
508       *(current_space->array++) = col;
509       current_space->local_used++;
510       current_space->local_remaining--;
511       ci[i+1]++;
512 
513       /* stash if anything else remains in this row of B */
514       if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);}
515       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
516         PetscInt j2,col2;
517         ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr);
518         if (col2 != col) break;
519         ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr);
520         if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);}
521       }
522       /* Put any stashed elements back into the min heap */
523       ierr = PetscHeapUnstash(h);CHKERRQ(ierr);
524       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
525     }
526   }
527   ierr = PetscFree(bb);CHKERRQ(ierr);
528   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
529 
530   /* Column indices are in the list of free space */
531   /* Allocate space for cj, initialize cj, and */
532   /* destroy list of free space and other temporary array(s) */
533   ierr = PetscMalloc(ci[am]*sizeof(PetscInt),&cj);CHKERRQ(ierr);
534   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
535 
536   /* put together the new symbolic matrix */
537   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,PETSC_NULL,C);CHKERRQ(ierr);
538   (*C)->rmap->bs = A->rmap->bs;
539   (*C)->cmap->bs = B->cmap->bs;
540 
541   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
542   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
543   c = (Mat_SeqAIJ *)((*C)->data);
544   c->free_a   = PETSC_TRUE;
545   c->free_ij  = PETSC_TRUE;
546   c->nonew    = 0;
547   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
548 
549   /* set MatInfo */
550   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
551   if (afill < 1.0) afill = 1.0;
552   c->maxnz                     = ci[am];
553   c->nz                        = ci[am];
554   (*C)->info.mallocs           = ndouble;
555   (*C)->info.fill_ratio_given  = fill;
556   (*C)->info.fill_ratio_needed = afill;
557 
558 #if defined(PETSC_USE_INFO)
559   if (ci[am]) {
560     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
561     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
562   } else {
563     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
564   }
565 #endif
566   PetscFunctionReturn(0);
567 }
568 
569 #undef __FUNCT__
570 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap"
571 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat *C)
572 {
573   PetscErrorCode     ierr;
574   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
575   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
576   PetscInt           *ci,*cj,*bb;
577   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
578   PetscReal          afill;
579   PetscInt           i,j,col,ndouble = 0;
580   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
581   PetscHeap          h;
582   PetscBT            bt;
583 
584   PetscFunctionBegin;
585   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
586   /*---------------------------------------------------------------------------------------------*/
587   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
588   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
589   ci[0] = 0;
590 
591   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
592   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
593   current_space = free_space;
594 
595   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
596   ierr = PetscMalloc(a->rmax*sizeof(PetscInt),&bb);CHKERRQ(ierr);
597   ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr);
598 
599   /* Determine ci and cj */
600   for (i=0; i<am; i++) {
601     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
602     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
603     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
604     ci[i+1] = ci[i];
605     /* Populate the min heap */
606     for (j=0; j<anzi; j++) {
607       PetscInt brow = acol[j];
608       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
609         PetscInt bcol = bj[bb[j]];
610         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
611           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
612           bb[j]++;
613           break;
614         }
615       }
616     }
617     /* Pick off the min element, adding it to free space */
618     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
619     while (j >= 0) {
620       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
621         fptr = PETSC_NULL;                      /* need PetscBTMemzero */
622         ierr = PetscFreeSpaceGet(PetscMin(2*current_space->total_array_size,16 << 20),&current_space);CHKERRQ(ierr);
623         ndouble++;
624       }
625       *(current_space->array++) = col;
626       current_space->local_used++;
627       current_space->local_remaining--;
628       ci[i+1]++;
629 
630       /* stash if anything else remains in this row of B */
631       for ( ; bb[j] < bi[acol[j]+1]; bb[j]++) {
632         PetscInt bcol = bj[bb[j]];
633         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
634           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
635           bb[j]++;
636           break;
637         }
638       }
639       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
640     }
641     if (fptr) {                 /* Clear the bits for this row */
642       for ( ; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);}
643     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
644       ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr);
645     }
646   }
647   ierr = PetscFree(bb);CHKERRQ(ierr);
648   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
649   ierr = PetscBTDestroy(&bt);CHKERRQ(ierr);
650 
651   /* Column indices are in the list of free space */
652   /* Allocate space for cj, initialize cj, and */
653   /* destroy list of free space and other temporary array(s) */
654   ierr = PetscMalloc(ci[am]*sizeof(PetscInt),&cj);CHKERRQ(ierr);
655   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
656 
657   /* put together the new symbolic matrix */
658   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,PETSC_NULL,C);CHKERRQ(ierr);
659   (*C)->rmap->bs = A->rmap->bs;
660   (*C)->cmap->bs = B->cmap->bs;
661 
662   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
663   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
664   c = (Mat_SeqAIJ *)((*C)->data);
665   c->free_a   = PETSC_TRUE;
666   c->free_ij  = PETSC_TRUE;
667   c->nonew    = 0;
668   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
669 
670   /* set MatInfo */
671   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
672   if (afill < 1.0) afill = 1.0;
673   c->maxnz                     = ci[am];
674   c->nz                        = ci[am];
675   (*C)->info.mallocs           = ndouble;
676   (*C)->info.fill_ratio_given  = fill;
677   (*C)->info.fill_ratio_needed = afill;
678 
679 #if defined(PETSC_USE_INFO)
680   if (ci[am]) {
681     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
682     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
683   } else {
684     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
685   }
686 #endif
687   PetscFunctionReturn(0);
688 }
689 
690 /* This routine is not used. Should be removed! */
691 #undef __FUNCT__
692 #define __FUNCT__ "MatMatTransposeMult_SeqAIJ_SeqAIJ"
693 PetscErrorCode MatMatTransposeMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
694 {
695   PetscErrorCode ierr;
696 
697   PetscFunctionBegin;
698   if (scall == MAT_INITIAL_MATRIX){
699     ierr = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
700   }
701   ierr = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
702   PetscFunctionReturn(0);
703 }
704 
705 #undef __FUNCT__
706 #define __FUNCT__ "PetscContainerDestroy_Mat_MatMatTransMult"
707 PetscErrorCode PetscContainerDestroy_Mat_MatMatTransMult(void *ptr)
708 {
709   PetscErrorCode      ierr;
710   Mat_MatMatTransMult *multtrans=(Mat_MatMatTransMult*)ptr;
711 
712   PetscFunctionBegin;
713   ierr = MatTransposeColoringDestroy(&multtrans->matcoloring);CHKERRQ(ierr);
714   ierr = MatDestroy(&multtrans->Bt_den);CHKERRQ(ierr);
715   ierr = MatDestroy(&multtrans->ABt_den);CHKERRQ(ierr);
716   ierr = PetscFree(multtrans);CHKERRQ(ierr);
717   PetscFunctionReturn(0);
718 }
719 
720 #undef __FUNCT__
721 #define __FUNCT__ "MatDestroy_SeqAIJ_MatMatMultTrans"
722 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A)
723 {
724   PetscErrorCode      ierr;
725   PetscContainer      container;
726   Mat_MatMatTransMult *multtrans=PETSC_NULL;
727 
728   PetscFunctionBegin;
729   ierr = PetscObjectQuery((PetscObject)A,"Mat_MatMatTransMult",(PetscObject *)&container);CHKERRQ(ierr);
730   if (!container) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Container does not exit");
731   ierr = PetscContainerGetPointer(container,(void **)&multtrans);CHKERRQ(ierr);
732   A->ops->destroy   = multtrans->destroy;
733   if (A->ops->destroy) {
734     ierr = (*A->ops->destroy)(A);CHKERRQ(ierr);
735   }
736   ierr = PetscObjectCompose((PetscObject)A,"Mat_MatMatTransMult",0);CHKERRQ(ierr);
737   PetscFunctionReturn(0);
738 }
739 
740 #undef __FUNCT__
741 #define __FUNCT__ "MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ"
742 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
743 {
744   PetscErrorCode      ierr;
745   Mat                 Bt;
746   PetscInt            *bti,*btj;
747   Mat_MatMatTransMult *multtrans;
748   PetscContainer      container;
749 
750   PetscFunctionBegin;
751    /* create symbolic Bt */
752   ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
753   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,PETSC_NULL,&Bt);CHKERRQ(ierr);
754   Bt->rmap->bs = A->cmap->bs;
755   Bt->cmap->bs = B->cmap->bs;
756 
757   /* get symbolic C=A*Bt */
758   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr);
759 
760   /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
761   ierr = PetscNew(Mat_MatMatTransMult,&multtrans);CHKERRQ(ierr);
762 
763   /* attach the supporting struct to C */
764   ierr = PetscContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
765   ierr = PetscContainerSetPointer(container,multtrans);CHKERRQ(ierr);
766   ierr = PetscContainerSetUserDestroy(container,PetscContainerDestroy_Mat_MatMatTransMult);CHKERRQ(ierr);
767   ierr = PetscObjectCompose((PetscObject)(*C),"Mat_MatMatTransMult",(PetscObject)container);CHKERRQ(ierr);
768   ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
769 
770   multtrans->usecoloring = PETSC_FALSE;
771   multtrans->destroy = (*C)->ops->destroy;
772   (*C)->ops->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
773 
774   ierr = PetscOptionsGetBool(PETSC_NULL,"-matmattransmult_color",&multtrans->usecoloring,PETSC_NULL);CHKERRQ(ierr);
775   if (multtrans->usecoloring){
776     /* Create MatTransposeColoring from symbolic C=A*B^T */
777     MatTransposeColoring matcoloring;
778     ISColoring           iscoloring;
779     Mat                  Bt_dense,C_dense;
780 
781     ierr = MatGetColoring(*C,MATCOLORINGLF,&iscoloring);CHKERRQ(ierr);
782     ierr = MatTransposeColoringCreate(*C,iscoloring,&matcoloring);CHKERRQ(ierr);
783     multtrans->matcoloring = matcoloring;
784     ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr);
785 
786     /* Create Bt_dense and C_dense = A*Bt_dense */
787     ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr);
788     ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr);
789     ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr);
790     ierr = MatSeqDenseSetPreallocation(Bt_dense,PETSC_NULL);CHKERRQ(ierr);
791     Bt_dense->assembled = PETSC_TRUE;
792     multtrans->Bt_den = Bt_dense;
793 
794     ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr);
795     ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr);
796     ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr);
797     ierr = MatSeqDenseSetPreallocation(C_dense,PETSC_NULL);CHKERRQ(ierr);
798     Bt_dense->assembled = PETSC_TRUE;
799     multtrans->ABt_den = C_dense;
800 
801 #if defined(PETSC_USE_INFO)
802     {
803     Mat_SeqAIJ *c=(Mat_SeqAIJ*)(*C)->data;
804     ierr = PetscInfo5(*C,"Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",A->cmap->n,matcoloring->ncolors,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));CHKERRQ(ierr);
805     }
806 #endif
807   }
808   /* clean up */
809   ierr = MatDestroy(&Bt);CHKERRQ(ierr);
810   ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
811 
812 
813 
814 #if defined(INEFFICIENT_ALGORITHM)
815   /* The algorithm below computes am*bm sparse inner-product - inefficient! It will be deleted later. */
816   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
817   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
818   PetscInt           *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ci,*cj,*acol,*bcol;
819   PetscInt           am=A->rmap->N,bm=B->rmap->N;
820   PetscInt           i,j,anzi,bnzj,cnzi,nlnk,*lnk,nspacedouble=0,ka,kb,index[1];
821   MatScalar          *ca;
822   PetscBT            lnkbt;
823   PetscReal          afill;
824 
825   /* Allocate row pointer array ci  */
826   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
827   ci[0] = 0;
828 
829   /* Create and initialize a linked list for C columns */
830   nlnk = bm+1;
831   ierr = PetscLLCreate(bm,bm,nlnk,lnk,lnkbt);CHKERRQ(ierr);
832 
833   /* Initial FreeSpace with size fill*(nnz(A)+nnz(B)) */
834   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
835   current_space = free_space;
836 
837   /* Determine symbolic info for each row of the product A*B^T: */
838   for (i=0; i<am; i++) {
839     anzi = ai[i+1] - ai[i];
840     cnzi = 0;
841     acol = aj + ai[i];
842     for (j=0; j<bm; j++){
843       bnzj = bi[j+1] - bi[j];
844       bcol= bj + bi[j];
845       /* sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
846       ka = 0; kb = 0;
847       while (ka < anzi && kb < bnzj){
848         while (acol[ka] < bcol[kb] && ka < anzi) ka++;
849         if (ka == anzi) break;
850         while (acol[ka] > bcol[kb] && kb < bnzj) kb++;
851         if (kb == bnzj) break;
852         if (acol[ka] == bcol[kb]){ /* add nonzero c(i,j) to lnk */
853           index[0] = j;
854           ierr = PetscLLAdd(1,index,bm,nlnk,lnk,lnkbt);CHKERRQ(ierr);
855           cnzi++;
856           break;
857         }
858       }
859     }
860 
861     /* If free space is not available, make more free space */
862     /* Double the amount of total space in the list */
863     if (current_space->local_remaining<cnzi) {
864       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
865       nspacedouble++;
866     }
867 
868     /* Copy data into free space, then initialize lnk */
869     ierr = PetscLLClean(bm,bm,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
870     current_space->array           += cnzi;
871     current_space->local_used      += cnzi;
872     current_space->local_remaining -= cnzi;
873 
874     ci[i+1] = ci[i] + cnzi;
875   }
876 
877 
878   /* Column indices are in the list of free space.
879      Allocate array cj, copy column indices to cj, and destroy list of free space */
880   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
881   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
882   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
883 
884   /* Allocate space for ca */
885   ierr = PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
886   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
887 
888   /* put together the new symbolic matrix */
889   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bm,ci,cj,ca,C);CHKERRQ(ierr);
890   (*C)->rmap->bs = A->cmap->bs;
891   (*C)->cmap->bs = B->cmap->bs;
892 
893   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
894   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
895   c = (Mat_SeqAIJ *)((*C)->data);
896   c->free_a   = PETSC_TRUE;
897   c->free_ij  = PETSC_TRUE;
898   c->nonew    = 0;
899 
900   /* set MatInfo */
901   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
902   if (afill < 1.0) afill = 1.0;
903   c->maxnz                     = ci[am];
904   c->nz                        = ci[am];
905   (*C)->info.mallocs           = nspacedouble;
906   (*C)->info.fill_ratio_given  = fill;
907   (*C)->info.fill_ratio_needed = afill;
908 
909 #if defined(PETSC_USE_INFO)
910   if (ci[am]) {
911     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
912     ierr = PetscInfo1((*C),"Use MatMatTransposeMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
913   } else {
914     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
915   }
916 #endif
917 #endif
918   PetscFunctionReturn(0);
919 }
920 
921 /* #define USE_ARRAY - for sparse dot product. Slower than !USE_ARRAY */
922 #undef __FUNCT__
923 #define __FUNCT__ "MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ"
924 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
925 {
926   PetscErrorCode ierr;
927   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
928   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
929   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
930   PetscLogDouble flops=0.0;
931   MatScalar      *aa=a->a,*aval,*ba=b->a,*bval,*ca,*cval;
932   Mat_MatMatTransMult *multtrans;
933   PetscContainer      container;
934 #if defined(USE_ARRAY)
935   MatScalar      *spdot;
936 #endif
937 
938   PetscFunctionBegin;
939   /* clear old values in C */
940   if (!c->a){
941     ierr = PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
942     c->a      = ca;
943     c->free_a = PETSC_TRUE;
944   } else {
945     ca =  c->a;
946   }
947   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
948 
949   ierr = PetscObjectQuery((PetscObject)C,"Mat_MatMatTransMult",(PetscObject *)&container);CHKERRQ(ierr);
950   if (!container) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Container does not exit");
951   ierr  = PetscContainerGetPointer(container,(void **)&multtrans);CHKERRQ(ierr);
952   if (multtrans->usecoloring){
953     MatTransposeColoring  matcoloring = multtrans->matcoloring;
954     Mat                   Bt_dense;
955     PetscInt              m,n;
956     Mat C_dense = multtrans->ABt_den;
957 
958     Bt_dense = multtrans->Bt_den;
959     ierr = MatGetLocalSize(Bt_dense,&m,&n);CHKERRQ(ierr);
960 
961     /* Get Bt_dense by Apply MatTransposeColoring to B */
962     ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr);
963 
964     /* C_dense = A*Bt_dense */
965     ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr);
966 
967     /* Recover C from C_dense */
968     ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr);
969     PetscFunctionReturn(0);
970   }
971 
972 #if defined(USE_ARRAY)
973   /* allocate an array for implementing sparse inner-product */
974   ierr = PetscMalloc((A->cmap->n+1)*sizeof(MatScalar),&spdot);CHKERRQ(ierr);
975   ierr = PetscMemzero(spdot,(A->cmap->n+1)*sizeof(MatScalar));CHKERRQ(ierr);
976 #endif
977 
978   for (i=0; i<cm; i++) {
979     anzi = ai[i+1] - ai[i];
980     acol = aj + ai[i];
981     aval = aa + ai[i];
982     cnzi = ci[i+1] - ci[i];
983     ccol = cj + ci[i];
984     cval = ca + ci[i];
985     for (j=0; j<cnzi; j++){
986       brow = ccol[j];
987       bnzj = bi[brow+1] - bi[brow];
988       bcol = bj + bi[brow];
989       bval = ba + bi[brow];
990 
991       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
992 #if defined(USE_ARRAY)
993       /* put ba to spdot array */
994       for (nextb=0; nextb<bnzj; nextb++) spdot[bcol[nextb]] = bval[nextb];
995       /* c(i,j)=A[i,:]*B[j,:]^T */
996       for (nexta=0; nexta<anzi; nexta++){
997         cval[j] += spdot[acol[nexta]]*aval[nexta];
998       }
999       /* zero spdot array */
1000       for (nextb=0; nextb<bnzj; nextb++) spdot[bcol[nextb]] = 0.0;
1001 #else
1002       nexta = 0; nextb = 0;
1003       while (nexta<anzi && nextb<bnzj){
1004         while (acol[nexta] < bcol[nextb] && nexta < anzi) nexta++;
1005         if (nexta == anzi) break;
1006         while (acol[nexta] > bcol[nextb] && nextb < bnzj) nextb++;
1007         if (nextb == bnzj) break;
1008         if (acol[nexta] == bcol[nextb]){
1009           cval[j] += aval[nexta]*bval[nextb];
1010           nexta++; nextb++;
1011           flops += 2;
1012         }
1013       }
1014 #endif
1015     }
1016   }
1017   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1018   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1019   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1020 #if defined(USE_ARRAY)
1021   ierr = PetscFree(spdot);
1022 #endif
1023   PetscFunctionReturn(0);
1024 }
1025 
1026 #undef __FUNCT__
1027 #define __FUNCT__ "MatTransposeMatMult_SeqAIJ_SeqAIJ"
1028 PetscErrorCode MatTransposeMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C) {
1029   PetscErrorCode ierr;
1030 
1031   PetscFunctionBegin;
1032   if (scall == MAT_INITIAL_MATRIX){
1033     ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
1034   }
1035   ierr = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
1036   PetscFunctionReturn(0);
1037 }
1038 
1039 #undef __FUNCT__
1040 #define __FUNCT__ "MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ"
1041 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
1042 {
1043   PetscErrorCode ierr;
1044   Mat            At;
1045   PetscInt       *ati,*atj;
1046 
1047   PetscFunctionBegin;
1048   /* create symbolic At */
1049   ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
1050   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,PETSC_NULL,&At);CHKERRQ(ierr);
1051   At->rmap->bs = A->cmap->bs;
1052   At->cmap->bs = B->cmap->bs;
1053 
1054   /* get symbolic C=At*B */
1055   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
1056 
1057   /* clean up */
1058   ierr = MatDestroy(&At);CHKERRQ(ierr);
1059   ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
1060   PetscFunctionReturn(0);
1061 }
1062 
1063 #undef __FUNCT__
1064 #define __FUNCT__ "MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ"
1065 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1066 {
1067   PetscErrorCode ierr;
1068   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1069   PetscInt       am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1070   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1071   PetscLogDouble flops=0.0;
1072   MatScalar      *aa=a->a,*ba,*ca,*caj;
1073 
1074   PetscFunctionBegin;
1075   if (!c->a){
1076     ierr = PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
1077     c->a      = ca;
1078     c->free_a = PETSC_TRUE;
1079   } else {
1080     ca = c->a;
1081   }
1082   /* clear old values in C */
1083   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1084 
1085   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1086   for (i=0;i<am;i++) {
1087     bj   = b->j + bi[i];
1088     ba   = b->a + bi[i];
1089     bnzi = bi[i+1] - bi[i];
1090     anzi = ai[i+1] - ai[i];
1091     for (j=0; j<anzi; j++) {
1092       nextb = 0;
1093       crow  = *aj++;
1094       cjj   = cj + ci[crow];
1095       caj   = ca + ci[crow];
1096       /* perform sparse axpy operation.  Note cjj includes bj. */
1097       for (k=0; nextb<bnzi; k++) {
1098         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
1099           caj[k] += (*aa)*(*(ba+nextb));
1100           nextb++;
1101         }
1102       }
1103       flops += 2*bnzi;
1104       aa++;
1105     }
1106   }
1107 
1108   /* Assemble the final matrix and clean up */
1109   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1110   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1111   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1112   PetscFunctionReturn(0);
1113 }
1114 
1115 EXTERN_C_BEGIN
1116 #undef __FUNCT__
1117 #define __FUNCT__ "MatMatMult_SeqAIJ_SeqDense"
1118 PetscErrorCode MatMatMult_SeqAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
1119 {
1120   PetscErrorCode ierr;
1121 
1122   PetscFunctionBegin;
1123   if (scall == MAT_INITIAL_MATRIX){
1124     ierr = MatMatMultSymbolic_SeqAIJ_SeqDense(A,B,fill,C);CHKERRQ(ierr);
1125   }
1126   ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,B,*C);CHKERRQ(ierr);
1127   PetscFunctionReturn(0);
1128 }
1129 EXTERN_C_END
1130 
1131 #undef __FUNCT__
1132 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqDense"
1133 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
1134 {
1135   PetscErrorCode ierr;
1136 
1137   PetscFunctionBegin;
1138   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
1139   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1140   PetscFunctionReturn(0);
1141 }
1142 
1143 #undef __FUNCT__
1144 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqDense"
1145 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1146 {
1147   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
1148   PetscErrorCode ierr;
1149   PetscScalar    *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
1150   MatScalar      *aa;
1151   PetscInt       cm=C->rmap->n, cn=B->cmap->n, bm=B->rmap->n, col, i,j,n,*aj, am = A->rmap->n;
1152   PetscInt       am2 = 2*am, am3 = 3*am,  bm4 = 4*bm,colam;
1153 
1154   PetscFunctionBegin;
1155   if (!cm || !cn) PetscFunctionReturn(0);
1156   if (bm != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,bm);
1157   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1158   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1159   ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr);
1160   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1161   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1162   for (col=0; col<cn-4; col += 4){  /* over columns of C */
1163     colam = col*am;
1164     for (i=0; i<am; i++) {        /* over rows of C in those columns */
1165       r1 = r2 = r3 = r4 = 0.0;
1166       n   = a->i[i+1] - a->i[i];
1167       aj  = a->j + a->i[i];
1168       aa  = a->a + a->i[i];
1169       for (j=0; j<n; j++) {
1170         r1 += (*aa)*b1[*aj];
1171         r2 += (*aa)*b2[*aj];
1172         r3 += (*aa)*b3[*aj];
1173         r4 += (*aa++)*b4[*aj++];
1174       }
1175       c[colam + i]       = r1;
1176       c[colam + am + i]  = r2;
1177       c[colam + am2 + i] = r3;
1178       c[colam + am3 + i] = r4;
1179     }
1180     b1 += bm4;
1181     b2 += bm4;
1182     b3 += bm4;
1183     b4 += bm4;
1184   }
1185   for (;col<cn; col++){     /* over extra columns of C */
1186     for (i=0; i<am; i++) {  /* over rows of C in those columns */
1187       r1 = 0.0;
1188       n   = a->i[i+1] - a->i[i];
1189       aj  = a->j + a->i[i];
1190       aa  = a->a + a->i[i];
1191 
1192       for (j=0; j<n; j++) {
1193         r1 += (*aa++)*b1[*aj++];
1194       }
1195       c[col*am + i]     = r1;
1196     }
1197     b1 += bm;
1198   }
1199   ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr);
1200   ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr);
1201   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1202   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1203   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1204   PetscFunctionReturn(0);
1205 }
1206 
1207 /*
1208    Note very similar to MatMult_SeqAIJ(), should generate both codes from same base
1209 */
1210 #undef __FUNCT__
1211 #define __FUNCT__ "MatMatMultNumericAdd_SeqAIJ_SeqDense"
1212 PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1213 {
1214   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
1215   PetscErrorCode ierr;
1216   PetscScalar    *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
1217   MatScalar      *aa;
1218   PetscInt       cm=C->rmap->n, cn=B->cmap->n, bm=B->rmap->n, col, i,j,n,*aj, am = A->rmap->n,*ii,arm;
1219   PetscInt       am2 = 2*am, am3 = 3*am,  bm4 = 4*bm,colam,*ridx;
1220 
1221   PetscFunctionBegin;
1222   if (!cm || !cn) PetscFunctionReturn(0);
1223   ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr);
1224   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1225   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1226 
1227   if (a->compressedrow.use){ /* use compressed row format */
1228     for (col=0; col<cn-4; col += 4){  /* over columns of C */
1229       colam = col*am;
1230       arm   = a->compressedrow.nrows;
1231       ii    = a->compressedrow.i;
1232       ridx  = a->compressedrow.rindex;
1233       for (i=0; i<arm; i++) {        /* over rows of C in those columns */
1234 	r1 = r2 = r3 = r4 = 0.0;
1235 	n   = ii[i+1] - ii[i];
1236 	aj  = a->j + ii[i];
1237 	aa  = a->a + ii[i];
1238 	for (j=0; j<n; j++) {
1239 	  r1 += (*aa)*b1[*aj];
1240 	  r2 += (*aa)*b2[*aj];
1241 	  r3 += (*aa)*b3[*aj];
1242 	  r4 += (*aa++)*b4[*aj++];
1243 	}
1244 	c[colam       + ridx[i]] += r1;
1245 	c[colam + am  + ridx[i]] += r2;
1246 	c[colam + am2 + ridx[i]] += r3;
1247 	c[colam + am3 + ridx[i]] += r4;
1248       }
1249       b1 += bm4;
1250       b2 += bm4;
1251       b3 += bm4;
1252       b4 += bm4;
1253     }
1254     for (;col<cn; col++){     /* over extra columns of C */
1255       colam = col*am;
1256       arm   = a->compressedrow.nrows;
1257       ii    = a->compressedrow.i;
1258       ridx  = a->compressedrow.rindex;
1259       for (i=0; i<arm; i++) {  /* over rows of C in those columns */
1260 	r1 = 0.0;
1261 	n   = ii[i+1] - ii[i];
1262 	aj  = a->j + ii[i];
1263 	aa  = a->a + ii[i];
1264 
1265 	for (j=0; j<n; j++) {
1266 	  r1 += (*aa++)*b1[*aj++];
1267 	}
1268 	c[colam + ridx[i]] += r1;
1269       }
1270       b1 += bm;
1271     }
1272   } else {
1273     for (col=0; col<cn-4; col += 4){  /* over columns of C */
1274       colam = col*am;
1275       for (i=0; i<am; i++) {        /* over rows of C in those columns */
1276 	r1 = r2 = r3 = r4 = 0.0;
1277 	n   = a->i[i+1] - a->i[i];
1278 	aj  = a->j + a->i[i];
1279 	aa  = a->a + a->i[i];
1280 	for (j=0; j<n; j++) {
1281 	  r1 += (*aa)*b1[*aj];
1282 	  r2 += (*aa)*b2[*aj];
1283 	  r3 += (*aa)*b3[*aj];
1284 	  r4 += (*aa++)*b4[*aj++];
1285 	}
1286 	c[colam + i]       += r1;
1287 	c[colam + am + i]  += r2;
1288 	c[colam + am2 + i] += r3;
1289 	c[colam + am3 + i] += r4;
1290       }
1291       b1 += bm4;
1292       b2 += bm4;
1293       b3 += bm4;
1294       b4 += bm4;
1295     }
1296     for (;col<cn; col++){     /* over extra columns of C */
1297       colam = col*am;
1298       for (i=0; i<am; i++) {  /* over rows of C in those columns */
1299 	r1 = 0.0;
1300 	n   = a->i[i+1] - a->i[i];
1301 	aj  = a->j + a->i[i];
1302 	aa  = a->a + a->i[i];
1303 
1304 	for (j=0; j<n; j++) {
1305 	  r1 += (*aa++)*b1[*aj++];
1306 	}
1307 	c[colam + i]     += r1;
1308       }
1309       b1 += bm;
1310     }
1311   }
1312   ierr = PetscLogFlops(cn*2.0*a->nz);CHKERRQ(ierr);
1313   ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr);
1314   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1315   PetscFunctionReturn(0);
1316 }
1317 
1318 #undef __FUNCT__
1319 #define __FUNCT__ "MatTransColoringApplySpToDen_SeqAIJ"
1320 PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1321 {
1322   PetscErrorCode ierr;
1323   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
1324   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
1325   PetscInt       *bi=b->i,*bj=b->j;
1326   PetscInt       m=Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1327   MatScalar      *btval,*btval_den,*ba=b->a;
1328   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1329 
1330   PetscFunctionBegin;
1331   btval_den=btdense->v;
1332   ierr = PetscMemzero(btval_den,(m*n)*sizeof(MatScalar));CHKERRQ(ierr);
1333   for (k=0; k<ncolors; k++) {
1334     ncolumns = coloring->ncolumns[k];
1335     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1336       col   = *(columns + colorforcol[k] + l);
1337       btcol = bj + bi[col];
1338       btval = ba + bi[col];
1339       anz   = bi[col+1] - bi[col];
1340       for (j=0; j<anz; j++){
1341         brow            = btcol[j];
1342         btval_den[brow] = btval[j];
1343       }
1344     }
1345     btval_den += m;
1346   }
1347   PetscFunctionReturn(0);
1348 }
1349 
1350 #undef __FUNCT__
1351 #define __FUNCT__ "MatTransColoringApplyDenToSp_SeqAIJ"
1352 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1353 {
1354   PetscErrorCode ierr;
1355   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)Csp->data;
1356   PetscInt       k,l,*row,*idx,m,ncolors=matcoloring->ncolors,nrows;
1357   PetscScalar    *ca_den,*cp_den,*ca=csp->a;
1358   PetscInt       *rows=matcoloring->rows,*spidx=matcoloring->columnsforspidx,*colorforrow=matcoloring->colorforrow;
1359 
1360   PetscFunctionBegin;
1361   ierr = MatGetLocalSize(Csp,&m,PETSC_NULL);CHKERRQ(ierr);
1362   ierr = MatDenseGetArray(Cden,&ca_den);CHKERRQ(ierr);
1363   cp_den = ca_den;
1364   for (k=0; k<ncolors; k++) {
1365     nrows = matcoloring->nrows[k];
1366     row   = rows  + colorforrow[k];
1367     idx   = spidx + colorforrow[k];
1368     for (l=0; l<nrows; l++){
1369       ca[idx[l]] = cp_den[row[l]];
1370     }
1371     cp_den += m;
1372   }
1373   ierr = MatDenseRestoreArray(Cden,&ca_den);CHKERRQ(ierr);
1374   PetscFunctionReturn(0);
1375 }
1376 
1377 /*
1378  MatGetColumnIJ_SeqAIJ_Color() and MatRestoreColumnIJ_SeqAIJ_Color() are customized from
1379  MatGetColumnIJ_SeqAIJ() and MatRestoreColumnIJ_SeqAIJ() by adding an output
1380  spidx[], index of a->j, to be used for setting 'columnsforspidx' in MatTransposeColoringCreate_SeqAIJ().
1381  */
1382 #undef __FUNCT__
1383 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Color"
1384 PetscErrorCode MatGetColumnIJ_SeqAIJ_Color(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  inodecompressed,PetscInt *nn,const PetscInt *ia[],const PetscInt *ja[],PetscInt *spidx[],PetscBool  *done)
1385 {
1386   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
1387   PetscErrorCode ierr;
1388   PetscInt       i,*collengths,*cia,*cja,n = A->cmap->n,m = A->rmap->n;
1389   PetscInt       nz = a->i[m],row,*jj,mr,col;
1390   PetscInt       *cspidx;
1391 
1392   PetscFunctionBegin;
1393   *nn = n;
1394   if (!ia) PetscFunctionReturn(0);
1395   if (symmetric) {
1396     SETERRQ(((PetscObject)A)->comm,PETSC_ERR_SUP,"MatGetColumnIJ_SeqAIJ_Color() not supported for the case symmetric");
1397     ierr = MatToSymmetricIJ_SeqAIJ(A->rmap->n,a->i,a->j,0,oshift,(PetscInt**)ia,(PetscInt**)ja);CHKERRQ(ierr);
1398   } else {
1399     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&collengths);CHKERRQ(ierr);
1400     ierr = PetscMemzero(collengths,n*sizeof(PetscInt));CHKERRQ(ierr);
1401     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&cia);CHKERRQ(ierr);
1402     ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&cja);CHKERRQ(ierr);
1403     ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&cspidx);CHKERRQ(ierr);
1404     jj = a->j;
1405     for (i=0; i<nz; i++) {
1406       collengths[jj[i]]++;
1407     }
1408     cia[0] = oshift;
1409     for (i=0; i<n; i++) {
1410       cia[i+1] = cia[i] + collengths[i];
1411     }
1412     ierr = PetscMemzero(collengths,n*sizeof(PetscInt));CHKERRQ(ierr);
1413     jj   = a->j;
1414     for (row=0; row<m; row++) {
1415       mr = a->i[row+1] - a->i[row];
1416       for (i=0; i<mr; i++) {
1417         col = *jj++;
1418         cspidx[cia[col] + collengths[col] - oshift] = a->i[row] + i; /* index of a->j */
1419         cja[cia[col] + collengths[col]++ - oshift] = row + oshift;
1420       }
1421     }
1422     ierr = PetscFree(collengths);CHKERRQ(ierr);
1423     *ia = cia; *ja = cja;
1424     *spidx = cspidx;
1425   }
1426   PetscFunctionReturn(0);
1427 }
1428 
1429 #undef __FUNCT__
1430 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Color"
1431 PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Color(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  inodecompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscInt *spidx[],PetscBool  *done)
1432 {
1433   PetscErrorCode ierr;
1434 
1435   PetscFunctionBegin;
1436   if (!ia) PetscFunctionReturn(0);
1437 
1438   ierr = PetscFree(*ia);CHKERRQ(ierr);
1439   ierr = PetscFree(*ja);CHKERRQ(ierr);
1440   ierr = PetscFree(*spidx);CHKERRQ(ierr);
1441   PetscFunctionReturn(0);
1442 }
1443 
1444 #undef __FUNCT__
1445 #define __FUNCT__ "MatTransposeColoringCreate_SeqAIJ"
1446 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1447 {
1448   PetscErrorCode ierr;
1449   PetscInt       i,n,nrows,N,j,k,m,ncols,col,cm;
1450   const PetscInt *is,*ci,*cj,*row_idx;
1451   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1452   IS             *isa;
1453   PetscBool      flg1,flg2;
1454   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1455   PetscInt       *colorforrow,*rows,*rows_i,*columnsforspidx,*columnsforspidx_i,*idxhit,*spidx;
1456   PetscInt       *colorforcol,*columns,*columns_i;
1457 
1458   PetscFunctionBegin;
1459   ierr = ISColoringGetIS(iscoloring,PETSC_IGNORE,&isa);CHKERRQ(ierr);
1460 
1461   /* this is ugly way to get blocksize but cannot call MatGetBlockSize() because AIJ can have bs > 1 */
1462   ierr = PetscObjectTypeCompare((PetscObject)mat,MATSEQBAIJ,&flg1);CHKERRQ(ierr);
1463   ierr = PetscObjectTypeCompare((PetscObject)mat,MATMPIBAIJ,&flg2);CHKERRQ(ierr);
1464   if (flg1 || flg2) {
1465     ierr = MatGetBlockSize(mat,&bs);CHKERRQ(ierr);
1466   }
1467 
1468   N         = mat->cmap->N/bs;
1469   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1470   c->N      = mat->cmap->N/bs;
1471   c->m      = mat->rmap->N/bs;
1472   c->rstart = 0;
1473 
1474   c->ncolors = nis;
1475   ierr       = PetscMalloc(nis*sizeof(PetscInt),&c->ncolumns);CHKERRQ(ierr);
1476   ierr       = PetscMalloc(nis*sizeof(PetscInt),&c->nrows);CHKERRQ(ierr);
1477   ierr       = PetscMalloc2(csp->nz+1,PetscInt,&rows,csp->nz+1,PetscInt,&columnsforspidx);CHKERRQ(ierr);
1478   ierr       = PetscMalloc((nis+1)*sizeof(PetscInt),&colorforrow);CHKERRQ(ierr);
1479   colorforrow[0]    = 0;
1480   rows_i            = rows;
1481   columnsforspidx_i = columnsforspidx;
1482 
1483   ierr = PetscMalloc((nis+1)*sizeof(PetscInt),&colorforcol);CHKERRQ(ierr);
1484   ierr = PetscMalloc((N+1)*sizeof(PetscInt),&columns);CHKERRQ(ierr);
1485   colorforcol[0] = 0;
1486   columns_i      = columns;
1487 
1488   ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,PETSC_NULL);CHKERRQ(ierr); /* column-wise storage of mat */
1489 
1490   cm = c->m;
1491   ierr = PetscMalloc((cm+1)*sizeof(PetscInt),&rowhit);CHKERRQ(ierr);
1492   ierr = PetscMalloc((cm+1)*sizeof(PetscInt),&idxhit);CHKERRQ(ierr);
1493   for (i=0; i<nis; i++) {
1494     ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr);
1495     ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr);
1496     c->ncolumns[i] = n;
1497     if (n) {
1498       ierr = PetscMemcpy(columns_i,is,n*sizeof(PetscInt));CHKERRQ(ierr);
1499     }
1500     colorforcol[i+1] = colorforcol[i] + n;
1501     columns_i       += n;
1502 
1503     /* fast, crude version requires O(N*N) work */
1504     ierr = PetscMemzero(rowhit,cm*sizeof(PetscInt));CHKERRQ(ierr);
1505 
1506     /* loop over columns*/
1507     for (j=0; j<n; j++) {
1508       col     = is[j];
1509       row_idx = cj + ci[col];
1510       m       = ci[col+1] - ci[col];
1511       /* loop over columns marking them in rowhit */
1512       for (k=0; k<m; k++) {
1513         idxhit[*row_idx]   = spidx[ci[col] + k];
1514         rowhit[*row_idx++] = col + 1;
1515       }
1516     }
1517     /* count the number of hits */
1518     nrows = 0;
1519     for (j=0; j<cm; j++) {
1520       if (rowhit[j]) nrows++;
1521     }
1522     c->nrows[i]      = nrows;
1523     colorforrow[i+1] = colorforrow[i] + nrows;
1524 
1525     nrows       = 0;
1526     for (j=0; j<cm; j++) {
1527       if (rowhit[j]) {
1528         rows_i[nrows]            = j;
1529         columnsforspidx_i[nrows] = idxhit[j];
1530         nrows++;
1531       }
1532     }
1533     ierr = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr);
1534     rows_i += nrows; columnsforspidx_i += nrows;
1535   }
1536   ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,PETSC_NULL);CHKERRQ(ierr);
1537   ierr = PetscFree(rowhit);CHKERRQ(ierr);
1538   ierr = ISColoringRestoreIS(iscoloring,&isa);CHKERRQ(ierr);
1539 #if defined(PETSC_USE_DEBUG)
1540   if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1541 #endif
1542 
1543   c->colorforrow     = colorforrow;
1544   c->rows            = rows;
1545   c->columnsforspidx = columnsforspidx;
1546   c->colorforcol     = colorforcol;
1547   c->columns         = columns;
1548   ierr = PetscFree(idxhit);CHKERRQ(ierr);
1549   PetscFunctionReturn(0);
1550 }
1551