xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 5c8f6a953e7ed1c81f507d64aebddb11080b60e9)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4           C = A * B
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <../src/mat/utils/petscheap.h>
10 #include <petscbt.h>
11 #include <../src/mat/impls/dense/seq/dense.h>
12 
13 #undef __FUNCT__
14 #define __FUNCT__ "MatMatMult_SeqAIJ_SeqAIJ"
15 PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
16 {
17   PetscErrorCode ierr;
18   PetscBool      scalable=PETSC_FALSE,scalable_fast=PETSC_FALSE,heap = PETSC_FALSE,btheap = PETSC_FALSE;
19 
20   PetscFunctionBegin;
21   if (scall == MAT_INITIAL_MATRIX) {
22     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
23     ierr = PetscOptionsBool("-matmatmult_scalable","Use a scalable but slower C=A*B","",scalable,&scalable,PETSC_NULL);CHKERRQ(ierr);
24     ierr = PetscOptionsBool("-matmatmult_scalable_fast","Use a scalable but slower C=A*B","",scalable_fast,&scalable_fast,PETSC_NULL);CHKERRQ(ierr);
25     ierr = PetscOptionsBool("-matmatmult_heap","Use heap implementation of symbolic factorization C=A*B","",heap,&heap,PETSC_NULL);CHKERRQ(ierr);
26     ierr = PetscOptionsBool("-matmatmult_btheap","Use btheap implementation of symbolic factorization C=A*B","",btheap,&btheap,PETSC_NULL);CHKERRQ(ierr);
27     ierr = PetscOptionsEnd();CHKERRQ(ierr);
28     if (scalable_fast) {
29       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);CHKERRQ(ierr);
30     } else if (scalable) {
31       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);CHKERRQ(ierr);
32     } else if (heap) {
33       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);CHKERRQ(ierr);
34     } else if (btheap) {
35       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);CHKERRQ(ierr);
36     } else {
37       ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
38     }
39   }
40 
41   ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
42   PetscFunctionReturn(0);
43 }
44 
45 #undef __FUNCT__
46 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ"
47 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
48 {
49   PetscErrorCode     ierr;
50   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
51   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
52   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
53   PetscReal          afill;
54   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
55   PetscBT            lnkbt;
56   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
57 
58   PetscFunctionBegin;
59   /* Get ci and cj */
60   /*---------------*/
61   /* Allocate ci array, arrays for fill computation and */
62   /* free space for accumulating nonzero column info */
63   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
64   ci[0] = 0;
65 
66   /* create and initialize a linked list */
67   nlnk_max = a->rmax*b->rmax;
68   if (!nlnk_max || nlnk_max > bn) nlnk_max = bn;
69   ierr = PetscLLCondensedCreate(nlnk_max,bn,&lnk,&lnkbt);CHKERRQ(ierr);
70 
71   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
72   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
73   current_space = free_space;
74 
75   /* Determine ci and cj */
76   for (i=0; i<am; i++) {
77     anzi = ai[i+1] - ai[i];
78     aj   = a->j + ai[i];
79     for (j=0; j<anzi; j++) {
80       brow = aj[j];
81       bnzj = bi[brow+1] - bi[brow];
82       bj   = b->j + bi[brow];
83       /* add non-zero cols of B into the sorted linked list lnk */
84       ierr = PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);CHKERRQ(ierr);
85     }
86     cnzi = lnk[0];
87 
88     /* If free space is not available, make more free space */
89     /* Double the amount of total space in the list */
90     if (current_space->local_remaining<cnzi) {
91       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
92       ndouble++;
93     }
94 
95     /* Copy data into free space, then initialize lnk */
96     ierr = PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
97     current_space->array           += cnzi;
98     current_space->local_used      += cnzi;
99     current_space->local_remaining -= cnzi;
100     ci[i+1] = ci[i] + cnzi;
101   }
102 
103   /* Column indices are in the list of free space */
104   /* Allocate space for cj, initialize cj, and */
105   /* destroy list of free space and other temporary array(s) */
106   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
107   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
108   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
109 
110   /* put together the new symbolic matrix */
111   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,PETSC_NULL,C);CHKERRQ(ierr);
112   (*C)->rmap->bs = A->rmap->bs;
113   (*C)->cmap->bs = B->cmap->bs;
114 
115   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
116   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
117   c = (Mat_SeqAIJ *)((*C)->data);
118   c->free_a  = PETSC_FALSE;
119   c->free_ij = PETSC_TRUE;
120   c->nonew   = 0;
121   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; /* fast, needs non-scalable O(bn) array 'abdense' */
122 
123   /* set MatInfo */
124   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
125   if (afill < 1.0) afill = 1.0;
126   c->maxnz                     = ci[am];
127   c->nz                        = ci[am];
128   (*C)->info.mallocs           = ndouble;
129   (*C)->info.fill_ratio_given  = fill;
130   (*C)->info.fill_ratio_needed = afill;
131 
132 #if defined(PETSC_USE_INFO)
133   if (ci[am]) {
134     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
135     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
136   } else {
137     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
138   }
139 #endif
140   PetscFunctionReturn(0);
141 }
142 
143 #undef __FUNCT__
144 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqAIJ"
145 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
146 {
147   PetscErrorCode ierr;
148   PetscLogDouble flops=0.0;
149   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
150   Mat_SeqAIJ     *b = (Mat_SeqAIJ *)B->data;
151   Mat_SeqAIJ     *c = (Mat_SeqAIJ *)C->data;
152   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
153   PetscInt       am=A->rmap->n,cm=C->rmap->n;
154   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
155   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca,valtmp;
156   PetscScalar    *ab_dense;
157 
158   PetscFunctionBegin;
159   /* printf("MatMatMultNumeric_SeqAIJ_SeqAIJ...ca %p\n",c->a); */
160   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
161     ierr = PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
162     c->a      = ca;
163     c->free_a = PETSC_TRUE;
164 
165     ierr = PetscMalloc(B->cmap->N*sizeof(PetscScalar),&ab_dense);CHKERRQ(ierr);
166     ierr = PetscMemzero(ab_dense,B->cmap->N*sizeof(PetscScalar));CHKERRQ(ierr);
167     c->matmult_abdense = ab_dense;
168   } else {
169     ca       = c->a;
170     ab_dense = c->matmult_abdense;
171   }
172 
173   /* clean old values in C */
174   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
175   /* Traverse A row-wise. */
176   /* Build the ith row in C by summing over nonzero columns in A, */
177   /* the rows of B corresponding to nonzeros of A. */
178   for (i=0; i<am; i++) {
179     anzi = ai[i+1] - ai[i];
180     for (j=0; j<anzi; j++) {
181       brow = aj[j];
182       bnzi = bi[brow+1] - bi[brow];
183       bjj  = bj + bi[brow];
184       baj  = ba + bi[brow];
185       /* perform dense axpy */
186       valtmp = aa[j];
187       for (k=0; k<bnzi; k++) {
188         ab_dense[bjj[k]] += valtmp*baj[k];
189       }
190       flops += 2*bnzi;
191     }
192     aj += anzi; aa += anzi;
193 
194     cnzi = ci[i+1] - ci[i];
195     for (k=0; k<cnzi; k++) {
196       ca[k]          += ab_dense[cj[k]];
197       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
198     }
199     flops += cnzi;
200     cj += cnzi; ca += cnzi;
201   }
202   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
203   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
204   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
205   PetscFunctionReturn(0);
206 }
207 
208 #undef __FUNCT__
209 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable"
210 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
211 {
212   PetscErrorCode ierr;
213   PetscLogDouble flops=0.0;
214   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
215   Mat_SeqAIJ     *b = (Mat_SeqAIJ *)B->data;
216   Mat_SeqAIJ     *c = (Mat_SeqAIJ *)C->data;
217   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
218   PetscInt       am=A->rmap->N,cm=C->rmap->N;
219   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
220   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
221   PetscInt       nextb;
222 
223   PetscFunctionBegin;
224   /* clean old values in C */
225   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
226   /* Traverse A row-wise. */
227   /* Build the ith row in C by summing over nonzero columns in A, */
228   /* the rows of B corresponding to nonzeros of A. */
229   for (i=0;i<am;i++) {
230     anzi = ai[i+1] - ai[i];
231     cnzi = ci[i+1] - ci[i];
232     for (j=0;j<anzi;j++) {
233       brow = aj[j];
234       bnzi = bi[brow+1] - bi[brow];
235       bjj  = bj + bi[brow];
236       baj  = ba + bi[brow];
237       /* perform sparse axpy */
238       valtmp = aa[j];
239       nextb  = 0;
240       for (k=0; nextb<bnzi; k++) {
241         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
242           ca[k] += valtmp*baj[nextb++];
243         }
244       }
245       flops += 2*bnzi;
246     }
247     aj += anzi; aa += anzi;
248     cj += cnzi; ca += cnzi;
249   }
250 
251   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
252   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
253   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
254   PetscFunctionReturn(0);
255 }
256 
257 #undef __FUNCT__
258 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast"
259 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat *C)
260 {
261   PetscErrorCode     ierr;
262   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
263   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
264   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
265   MatScalar          *ca;
266   PetscReal          afill;
267   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
268   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
269 
270   PetscFunctionBegin;
271   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
272   /*-----------------------------------------------------------------------------------------*/
273   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
274   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
275   ci[0] = 0;
276 
277   /* create and initialize a linked list */
278   nlnk_max = a->rmax*b->rmax;
279   if (!nlnk_max || nlnk_max > bn) nlnk_max = bn; /* in case rmax is not defined for A or B */
280   ierr = PetscLLCondensedCreate_fast(nlnk_max,&lnk);CHKERRQ(ierr);
281 
282   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
283   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
284   current_space = free_space;
285 
286   /* Determine ci and cj */
287   for (i=0; i<am; i++) {
288     anzi = ai[i+1] - ai[i];
289     aj   = a->j + ai[i];
290     for (j=0; j<anzi; j++) {
291       brow = aj[j];
292       bnzj = bi[brow+1] - bi[brow];
293       bj   = b->j + bi[brow];
294       /* add non-zero cols of B into the sorted linked list lnk */
295       ierr = PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);CHKERRQ(ierr);
296     }
297     cnzi = lnk[1];
298 
299     /* If free space is not available, make more free space */
300     /* Double the amount of total space in the list */
301     if (current_space->local_remaining<cnzi) {
302       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
303       ndouble++;
304     }
305 
306     /* Copy data into free space, then initialize lnk */
307     ierr = PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);CHKERRQ(ierr);
308     current_space->array           += cnzi;
309     current_space->local_used      += cnzi;
310     current_space->local_remaining -= cnzi;
311     ci[i+1] = ci[i] + cnzi;
312   }
313 
314   /* Column indices are in the list of free space */
315   /* Allocate space for cj, initialize cj, and */
316   /* destroy list of free space and other temporary array(s) */
317   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
318   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
319   ierr = PetscLLCondensedDestroy_fast(lnk);CHKERRQ(ierr);
320 
321   /* Allocate space for ca */
322   ierr = PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
323   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
324 
325   /* put together the new symbolic matrix */
326   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,ca,C);CHKERRQ(ierr);
327   (*C)->rmap->bs = A->rmap->bs;
328   (*C)->cmap->bs = B->cmap->bs;
329 
330   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
331   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
332   c = (Mat_SeqAIJ *)((*C)->data);
333   c->free_a   = PETSC_TRUE;
334   c->free_ij  = PETSC_TRUE;
335   c->nonew    = 0;
336   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
337 
338   /* set MatInfo */
339   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
340   if (afill < 1.0) afill = 1.0;
341   c->maxnz                     = ci[am];
342   c->nz                        = ci[am];
343   (*C)->info.mallocs           = ndouble;
344   (*C)->info.fill_ratio_given  = fill;
345   (*C)->info.fill_ratio_needed = afill;
346 
347 #if defined(PETSC_USE_INFO)
348   if (ci[am]) {
349     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
350     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
351   } else {
352     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
353   }
354 #endif
355   PetscFunctionReturn(0);
356 }
357 
358 
359 #undef __FUNCT__
360 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable"
361 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat *C)
362 {
363   PetscErrorCode     ierr;
364   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
365   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
366   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
367   MatScalar          *ca;
368   PetscReal          afill;
369   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
370   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
371 
372   PetscFunctionBegin;
373   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
374   /*---------------------------------------------------------------------------------------------*/
375   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
376   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
377   ci[0] = 0;
378 
379   /* create and initialize a linked list */
380   nlnk_max = a->rmax*b->rmax;
381   if (!nlnk_max || nlnk_max > bn) nlnk_max = bn; /* in case rmax is not defined for A or B */
382   ierr = PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);CHKERRQ(ierr);
383 
384   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
385   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
386   current_space = free_space;
387 
388   /* Determine ci and cj */
389   for (i=0; i<am; i++) {
390     anzi = ai[i+1] - ai[i];
391     aj   = a->j + ai[i];
392     for (j=0; j<anzi; j++) {
393       brow = aj[j];
394       bnzj = bi[brow+1] - bi[brow];
395       bj   = b->j + bi[brow];
396       /* add non-zero cols of B into the sorted linked list lnk */
397       ierr = PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);CHKERRQ(ierr);
398     }
399     cnzi = lnk[0];
400 
401     /* If free space is not available, make more free space */
402     /* Double the amount of total space in the list */
403     if (current_space->local_remaining<cnzi) {
404       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
405       ndouble++;
406     }
407 
408     /* Copy data into free space, then initialize lnk */
409     ierr = PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);CHKERRQ(ierr);
410     current_space->array           += cnzi;
411     current_space->local_used      += cnzi;
412     current_space->local_remaining -= cnzi;
413     ci[i+1] = ci[i] + cnzi;
414   }
415 
416   /* Column indices are in the list of free space */
417   /* Allocate space for cj, initialize cj, and */
418   /* destroy list of free space and other temporary array(s) */
419   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
420   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
421   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
422 
423   /* Allocate space for ca */
424   /*-----------------------*/
425   ierr = PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
426   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
427 
428   /* put together the new symbolic matrix */
429   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,ca,C);CHKERRQ(ierr);
430   (*C)->rmap->bs = A->rmap->bs;
431   (*C)->cmap->bs = B->cmap->bs;
432 
433   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
434   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
435   c = (Mat_SeqAIJ *)((*C)->data);
436   c->free_a   = PETSC_TRUE;
437   c->free_ij  = PETSC_TRUE;
438   c->nonew    = 0;
439   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
440 
441   /* set MatInfo */
442   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
443   if (afill < 1.0) afill = 1.0;
444   c->maxnz                     = ci[am];
445   c->nz                        = ci[am];
446   (*C)->info.mallocs           = ndouble;
447   (*C)->info.fill_ratio_given  = fill;
448   (*C)->info.fill_ratio_needed = afill;
449 
450 #if defined(PETSC_USE_INFO)
451   if (ci[am]) {
452     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
453     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
454   } else {
455     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
456   }
457 #endif
458   PetscFunctionReturn(0);
459 }
460 
461 #undef __FUNCT__
462 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap"
463 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat *C)
464 {
465   PetscErrorCode     ierr;
466   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
467   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
468   PetscInt           *ci,*cj,*bb;
469   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
470   PetscReal          afill;
471   PetscInt           i,j,col,ndouble = 0;
472   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
473   PetscHeap          h;
474 
475   PetscFunctionBegin;
476   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
477   /*---------------------------------------------------------------------------------------------*/
478   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
479   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
480   ci[0] = 0;
481 
482   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
483   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
484   current_space = free_space;
485 
486   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
487   ierr = PetscMalloc(a->rmax*sizeof(PetscInt),&bb);CHKERRQ(ierr);
488 
489   /* Determine ci and cj */
490   for (i=0; i<am; i++) {
491     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
492     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
493     ci[i+1] = ci[i];
494     /* Populate the min heap */
495     for (j=0; j<anzi; j++) {
496       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
497       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
498         ierr = PetscHeapAdd(h,j,bj[bb[j]++]);CHKERRQ(ierr);
499       }
500     }
501     /* Pick off the min element, adding it to free space */
502     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
503     while (j >= 0) {
504       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
505         ierr = PetscFreeSpaceGet(PetscMin(2*current_space->total_array_size,16 << 20),&current_space);CHKERRQ(ierr);
506         ndouble++;
507       }
508       *(current_space->array++) = col;
509       current_space->local_used++;
510       current_space->local_remaining--;
511       ci[i+1]++;
512 
513       /* stash if anything else remains in this row of B */
514       if (bb[j] < bi[acol[j]+1]) {ierr = PetscHeapStash(h,j,bj[bb[j]++]);CHKERRQ(ierr);}
515       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
516         PetscInt j2,col2;
517         ierr = PetscHeapPeek(h,&j2,&col2);CHKERRQ(ierr);
518         if (col2 != col) break;
519         ierr = PetscHeapPop(h,&j2,&col2);CHKERRQ(ierr);
520         if (bb[j2] < bi[acol[j2]+1]) {ierr = PetscHeapStash(h,j2,bj[bb[j2]++]);CHKERRQ(ierr);}
521       }
522       /* Put any stashed elements back into the min heap */
523       ierr = PetscHeapUnstash(h);CHKERRQ(ierr);
524       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
525     }
526   }
527   ierr = PetscFree(bb);CHKERRQ(ierr);
528   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
529 
530   /* Column indices are in the list of free space */
531   /* Allocate space for cj, initialize cj, and */
532   /* destroy list of free space and other temporary array(s) */
533   ierr = PetscMalloc(ci[am]*sizeof(PetscInt),&cj);CHKERRQ(ierr);
534   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
535 
536   /* put together the new symbolic matrix */
537   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,PETSC_NULL,C);CHKERRQ(ierr);
538   (*C)->rmap->bs = A->rmap->bs;
539   (*C)->cmap->bs = B->cmap->bs;
540 
541   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
542   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
543   c = (Mat_SeqAIJ *)((*C)->data);
544   c->free_a   = PETSC_TRUE;
545   c->free_ij  = PETSC_TRUE;
546   c->nonew    = 0;
547   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
548 
549   /* set MatInfo */
550   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
551   if (afill < 1.0) afill = 1.0;
552   c->maxnz                     = ci[am];
553   c->nz                        = ci[am];
554   (*C)->info.mallocs           = ndouble;
555   (*C)->info.fill_ratio_given  = fill;
556   (*C)->info.fill_ratio_needed = afill;
557 
558 #if defined(PETSC_USE_INFO)
559   if (ci[am]) {
560     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
561     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
562   } else {
563     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
564   }
565 #endif
566   PetscFunctionReturn(0);
567 }
568 
569 #undef __FUNCT__
570 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap"
571 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat *C)
572 {
573   PetscErrorCode     ierr;
574   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
575   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
576   PetscInt           *ci,*cj,*bb;
577   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
578   PetscReal          afill;
579   PetscInt           i,j,col,ndouble = 0;
580   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
581   PetscHeap          h;
582   PetscBT            bt;
583 
584   PetscFunctionBegin;
585   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
586   /*---------------------------------------------------------------------------------------------*/
587   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
588   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
589   ci[0] = 0;
590 
591   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
592   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
593   current_space = free_space;
594 
595   ierr = PetscHeapCreate(a->rmax,&h);CHKERRQ(ierr);
596   ierr = PetscMalloc(a->rmax*sizeof(PetscInt),&bb);CHKERRQ(ierr);
597   ierr = PetscBTCreate(bn,&bt);CHKERRQ(ierr);
598 
599   /* Determine ci and cj */
600   for (i=0; i<am; i++) {
601     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
602     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
603     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
604     ci[i+1] = ci[i];
605     /* Populate the min heap */
606     for (j=0; j<anzi; j++) {
607       PetscInt brow = acol[j];
608       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
609         PetscInt bcol = bj[bb[j]];
610         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
611           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
612           bb[j]++;
613           break;
614         }
615       }
616     }
617     /* Pick off the min element, adding it to free space */
618     ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
619     while (j >= 0) {
620       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
621         fptr = PETSC_NULL;                      /* need PetscBTMemzero */
622         ierr = PetscFreeSpaceGet(PetscMin(2*current_space->total_array_size,16 << 20),&current_space);CHKERRQ(ierr);
623         ndouble++;
624       }
625       *(current_space->array++) = col;
626       current_space->local_used++;
627       current_space->local_remaining--;
628       ci[i+1]++;
629 
630       /* stash if anything else remains in this row of B */
631       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
632         PetscInt bcol = bj[bb[j]];
633         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
634           ierr = PetscHeapAdd(h,j,bcol);CHKERRQ(ierr);
635           bb[j]++;
636           break;
637         }
638       }
639       ierr = PetscHeapPop(h,&j,&col);CHKERRQ(ierr);
640     }
641     if (fptr) {                 /* Clear the bits for this row */
642       for (; fptr<current_space->array; fptr++) {ierr = PetscBTClear(bt,*fptr);CHKERRQ(ierr);}
643     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
644       ierr = PetscBTMemzero(bn,bt);CHKERRQ(ierr);
645     }
646   }
647   ierr = PetscFree(bb);CHKERRQ(ierr);
648   ierr = PetscHeapDestroy(&h);CHKERRQ(ierr);
649   ierr = PetscBTDestroy(&bt);CHKERRQ(ierr);
650 
651   /* Column indices are in the list of free space */
652   /* Allocate space for cj, initialize cj, and */
653   /* destroy list of free space and other temporary array(s) */
654   ierr = PetscMalloc(ci[am]*sizeof(PetscInt),&cj);CHKERRQ(ierr);
655   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
656 
657   /* put together the new symbolic matrix */
658   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,PETSC_NULL,C);CHKERRQ(ierr);
659   (*C)->rmap->bs = A->rmap->bs;
660   (*C)->cmap->bs = B->cmap->bs;
661 
662   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
663   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
664   c = (Mat_SeqAIJ *)((*C)->data);
665   c->free_a   = PETSC_TRUE;
666   c->free_ij  = PETSC_TRUE;
667   c->nonew    = 0;
668   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
669 
670   /* set MatInfo */
671   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
672   if (afill < 1.0) afill = 1.0;
673   c->maxnz                     = ci[am];
674   c->nz                        = ci[am];
675   (*C)->info.mallocs           = ndouble;
676   (*C)->info.fill_ratio_given  = fill;
677   (*C)->info.fill_ratio_needed = afill;
678 
679 #if defined(PETSC_USE_INFO)
680   if (ci[am]) {
681     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);CHKERRQ(ierr);
682     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
683   } else {
684     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
685   }
686 #endif
687   PetscFunctionReturn(0);
688 }
689 
690 /* This routine is not used. Should be removed! */
691 #undef __FUNCT__
692 #define __FUNCT__ "MatMatTransposeMult_SeqAIJ_SeqAIJ"
693 PetscErrorCode MatMatTransposeMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
694 {
695   PetscErrorCode ierr;
696 
697   PetscFunctionBegin;
698   if (scall == MAT_INITIAL_MATRIX) {
699     ierr = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
700   }
701   ierr = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
702   PetscFunctionReturn(0);
703 }
704 
705 #undef __FUNCT__
706 #define __FUNCT__ "PetscContainerDestroy_Mat_MatMatTransMult"
707 PetscErrorCode PetscContainerDestroy_Mat_MatMatTransMult(void *ptr)
708 {
709   PetscErrorCode      ierr;
710   Mat_MatMatTransMult *multtrans=(Mat_MatMatTransMult*)ptr;
711 
712   PetscFunctionBegin;
713   ierr = MatTransposeColoringDestroy(&multtrans->matcoloring);CHKERRQ(ierr);
714   ierr = MatDestroy(&multtrans->Bt_den);CHKERRQ(ierr);
715   ierr = MatDestroy(&multtrans->ABt_den);CHKERRQ(ierr);
716   ierr = PetscFree(multtrans);CHKERRQ(ierr);
717   PetscFunctionReturn(0);
718 }
719 
720 #undef __FUNCT__
721 #define __FUNCT__ "MatDestroy_SeqAIJ_MatMatMultTrans"
722 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A)
723 {
724   PetscErrorCode      ierr;
725   PetscContainer      container;
726   Mat_MatMatTransMult *multtrans=PETSC_NULL;
727 
728   PetscFunctionBegin;
729   ierr = PetscObjectQuery((PetscObject)A,"Mat_MatMatTransMult",(PetscObject *)&container);CHKERRQ(ierr);
730   if (!container) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Container does not exit");
731   ierr = PetscContainerGetPointer(container,(void **)&multtrans);CHKERRQ(ierr);
732   A->ops->destroy   = multtrans->destroy;
733   if (A->ops->destroy) {
734     ierr = (*A->ops->destroy)(A);CHKERRQ(ierr);
735   }
736   ierr = PetscObjectCompose((PetscObject)A,"Mat_MatMatTransMult",0);CHKERRQ(ierr);
737   PetscFunctionReturn(0);
738 }
739 
740 #undef __FUNCT__
741 #define __FUNCT__ "MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ"
742 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
743 {
744   PetscErrorCode      ierr;
745   Mat                 Bt;
746   PetscInt            *bti,*btj;
747   Mat_MatMatTransMult *multtrans;
748   PetscContainer      container;
749 
750   PetscFunctionBegin;
751    /* create symbolic Bt */
752   ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
753   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,PETSC_NULL,&Bt);CHKERRQ(ierr);
754   Bt->rmap->bs = A->cmap->bs;
755   Bt->cmap->bs = B->cmap->bs;
756 
757   /* get symbolic C=A*Bt */
758   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr);
759 
760   /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
761   ierr = PetscNew(Mat_MatMatTransMult,&multtrans);CHKERRQ(ierr);
762 
763   /* attach the supporting struct to C */
764   ierr = PetscContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
765   ierr = PetscContainerSetPointer(container,multtrans);CHKERRQ(ierr);
766   ierr = PetscContainerSetUserDestroy(container,PetscContainerDestroy_Mat_MatMatTransMult);CHKERRQ(ierr);
767   ierr = PetscObjectCompose((PetscObject)(*C),"Mat_MatMatTransMult",(PetscObject)container);CHKERRQ(ierr);
768   ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
769 
770   multtrans->usecoloring = PETSC_FALSE;
771   multtrans->destroy = (*C)->ops->destroy;
772   (*C)->ops->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
773 
774   ierr = PetscOptionsGetBool(PETSC_NULL,"-matmattransmult_color",&multtrans->usecoloring,PETSC_NULL);CHKERRQ(ierr);
775   if (multtrans->usecoloring) {
776     /* Create MatTransposeColoring from symbolic C=A*B^T */
777     MatTransposeColoring matcoloring;
778     ISColoring           iscoloring;
779     Mat                  Bt_dense,C_dense;
780 
781     ierr = MatGetColoring(*C,MATCOLORINGLF,&iscoloring);CHKERRQ(ierr);
782     ierr = MatTransposeColoringCreate(*C,iscoloring,&matcoloring);CHKERRQ(ierr);
783     multtrans->matcoloring = matcoloring;
784     ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr);
785 
786     /* Create Bt_dense and C_dense = A*Bt_dense */
787     ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr);
788     ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr);
789     ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr);
790     ierr = MatSeqDenseSetPreallocation(Bt_dense,PETSC_NULL);CHKERRQ(ierr);
791     Bt_dense->assembled = PETSC_TRUE;
792     multtrans->Bt_den = Bt_dense;
793 
794     ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr);
795     ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr);
796     ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr);
797     ierr = MatSeqDenseSetPreallocation(C_dense,PETSC_NULL);CHKERRQ(ierr);
798     Bt_dense->assembled = PETSC_TRUE;
799     multtrans->ABt_den = C_dense;
800 
801 #if defined(PETSC_USE_INFO)
802     {
803     Mat_SeqAIJ *c=(Mat_SeqAIJ*)(*C)->data;
804     ierr = PetscInfo5(*C,"Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",A->cmap->n,matcoloring->ncolors,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));CHKERRQ(ierr);
805     }
806 #endif
807   }
808   /* clean up */
809   ierr = MatDestroy(&Bt);CHKERRQ(ierr);
810   ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
811 
812 
813 
814 #if defined(INEFFICIENT_ALGORITHM)
815   /* The algorithm below computes am*bm sparse inner-product - inefficient! It will be deleted later. */
816   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
817   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
818   PetscInt           *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ci,*cj,*acol,*bcol;
819   PetscInt           am=A->rmap->N,bm=B->rmap->N;
820   PetscInt           i,j,anzi,bnzj,cnzi,nlnk,*lnk,nspacedouble=0,ka,kb,index[1];
821   MatScalar          *ca;
822   PetscBT            lnkbt;
823   PetscReal          afill;
824 
825   /* Allocate row pointer array ci  */
826   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
827   ci[0] = 0;
828 
829   /* Create and initialize a linked list for C columns */
830   nlnk = bm+1;
831   ierr = PetscLLCreate(bm,bm,nlnk,lnk,lnkbt);CHKERRQ(ierr);
832 
833   /* Initial FreeSpace with size fill*(nnz(A)+nnz(B)) */
834   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
835   current_space = free_space;
836 
837   /* Determine symbolic info for each row of the product A*B^T: */
838   for (i=0; i<am; i++) {
839     anzi = ai[i+1] - ai[i];
840     cnzi = 0;
841     acol = aj + ai[i];
842     for (j=0; j<bm; j++) {
843       bnzj = bi[j+1] - bi[j];
844       bcol= bj + bi[j];
845       /* sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
846       ka = 0; kb = 0;
847       while (ka < anzi && kb < bnzj) {
848         while (acol[ka] < bcol[kb] && ka < anzi) ka++;
849         if (ka == anzi) break;
850         while (acol[ka] > bcol[kb] && kb < bnzj) kb++;
851         if (kb == bnzj) break;
852         if (acol[ka] == bcol[kb]) { /* add nonzero c(i,j) to lnk */
853           index[0] = j;
854           ierr = PetscLLAdd(1,index,bm,nlnk,lnk,lnkbt);CHKERRQ(ierr);
855           cnzi++;
856           break;
857         }
858       }
859     }
860 
861     /* If free space is not available, make more free space */
862     /* Double the amount of total space in the list */
863     if (current_space->local_remaining<cnzi) {
864       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
865       nspacedouble++;
866     }
867 
868     /* Copy data into free space, then initialize lnk */
869     ierr = PetscLLClean(bm,bm,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
870     current_space->array           += cnzi;
871     current_space->local_used      += cnzi;
872     current_space->local_remaining -= cnzi;
873 
874     ci[i+1] = ci[i] + cnzi;
875   }
876 
877 
878   /* Column indices are in the list of free space.
879      Allocate array cj, copy column indices to cj, and destroy list of free space */
880   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
881   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
882   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
883 
884   /* Allocate space for ca */
885   ierr = PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
886   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
887 
888   /* put together the new symbolic matrix */
889   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bm,ci,cj,ca,C);CHKERRQ(ierr);
890   (*C)->rmap->bs = A->cmap->bs;
891   (*C)->cmap->bs = B->cmap->bs;
892 
893   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
894   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
895   c = (Mat_SeqAIJ *)((*C)->data);
896   c->free_a   = PETSC_TRUE;
897   c->free_ij  = PETSC_TRUE;
898   c->nonew    = 0;
899 
900   /* set MatInfo */
901   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
902   if (afill < 1.0) afill = 1.0;
903   c->maxnz                     = ci[am];
904   c->nz                        = ci[am];
905   (*C)->info.mallocs           = nspacedouble;
906   (*C)->info.fill_ratio_given  = fill;
907   (*C)->info.fill_ratio_needed = afill;
908 
909 #if defined(PETSC_USE_INFO)
910   if (ci[am]) {
911     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
912     ierr = PetscInfo1((*C),"Use MatMatTransposeMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
913   } else {
914     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
915   }
916 #endif
917 #endif
918   PetscFunctionReturn(0);
919 }
920 
921 /* #define USE_ARRAY - for sparse dot product. Slower than !USE_ARRAY */
922 #undef __FUNCT__
923 #define __FUNCT__ "MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ"
924 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
925 {
926   PetscErrorCode ierr;
927   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
928   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
929   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
930   PetscLogDouble flops=0.0;
931   MatScalar      *aa=a->a,*aval,*ba=b->a,*bval,*ca,*cval;
932   Mat_MatMatTransMult *multtrans;
933   PetscContainer      container;
934 #if defined(USE_ARRAY)
935   MatScalar      *spdot;
936 #endif
937 
938   PetscFunctionBegin;
939   /* clear old values in C */
940   if (!c->a) {
941     ierr = PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
942     c->a      = ca;
943     c->free_a = PETSC_TRUE;
944   } else {
945     ca =  c->a;
946   }
947   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
948 
949   ierr = PetscObjectQuery((PetscObject)C,"Mat_MatMatTransMult",(PetscObject *)&container);CHKERRQ(ierr);
950   if (!container) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Container does not exit");
951   ierr  = PetscContainerGetPointer(container,(void **)&multtrans);CHKERRQ(ierr);
952   if (multtrans->usecoloring) {
953     MatTransposeColoring  matcoloring = multtrans->matcoloring;
954     Mat                   Bt_dense;
955     PetscInt              m,n;
956     Mat C_dense = multtrans->ABt_den;
957 
958     Bt_dense = multtrans->Bt_den;
959     ierr = MatGetLocalSize(Bt_dense,&m,&n);CHKERRQ(ierr);
960 
961     /* Get Bt_dense by Apply MatTransposeColoring to B */
962     ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr);
963 
964     /* C_dense = A*Bt_dense */
965     ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr);
966 
967     /* Recover C from C_dense */
968     ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr);
969     PetscFunctionReturn(0);
970   }
971 
972 #if defined(USE_ARRAY)
973   /* allocate an array for implementing sparse inner-product */
974   ierr = PetscMalloc((A->cmap->n+1)*sizeof(MatScalar),&spdot);CHKERRQ(ierr);
975   ierr = PetscMemzero(spdot,(A->cmap->n+1)*sizeof(MatScalar));CHKERRQ(ierr);
976 #endif
977 
978   for (i=0; i<cm; i++) {
979     anzi = ai[i+1] - ai[i];
980     acol = aj + ai[i];
981     aval = aa + ai[i];
982     cnzi = ci[i+1] - ci[i];
983     ccol = cj + ci[i];
984     cval = ca + ci[i];
985     for (j=0; j<cnzi; j++) {
986       brow = ccol[j];
987       bnzj = bi[brow+1] - bi[brow];
988       bcol = bj + bi[brow];
989       bval = ba + bi[brow];
990 
991       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
992 #if defined(USE_ARRAY)
993       /* put ba to spdot array */
994       for (nextb=0; nextb<bnzj; nextb++) spdot[bcol[nextb]] = bval[nextb];
995       /* c(i,j)=A[i,:]*B[j,:]^T */
996       for (nexta=0; nexta<anzi; nexta++) {
997         cval[j] += spdot[acol[nexta]]*aval[nexta];
998       }
999       /* zero spdot array */
1000       for (nextb=0; nextb<bnzj; nextb++) spdot[bcol[nextb]] = 0.0;
1001 #else
1002       nexta = 0; nextb = 0;
1003       while (nexta<anzi && nextb<bnzj) {
1004         while (acol[nexta] < bcol[nextb] && nexta < anzi) nexta++;
1005         if (nexta == anzi) break;
1006         while (acol[nexta] > bcol[nextb] && nextb < bnzj) nextb++;
1007         if (nextb == bnzj) break;
1008         if (acol[nexta] == bcol[nextb]) {
1009           cval[j] += aval[nexta]*bval[nextb];
1010           nexta++; nextb++;
1011           flops += 2;
1012         }
1013       }
1014 #endif
1015     }
1016   }
1017   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1018   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1019   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1020 #if defined(USE_ARRAY)
1021   ierr = PetscFree(spdot);
1022 #endif
1023   PetscFunctionReturn(0);
1024 }
1025 
1026 #undef __FUNCT__
1027 #define __FUNCT__ "MatTransposeMatMult_SeqAIJ_SeqAIJ"
1028 PetscErrorCode MatTransposeMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
1029 {
1030   PetscErrorCode ierr;
1031 
1032   PetscFunctionBegin;
1033   if (scall == MAT_INITIAL_MATRIX) {
1034     ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
1035   }
1036   ierr = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
1037   PetscFunctionReturn(0);
1038 }
1039 
1040 #undef __FUNCT__
1041 #define __FUNCT__ "MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ"
1042 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
1043 {
1044   PetscErrorCode ierr;
1045   Mat            At;
1046   PetscInt       *ati,*atj;
1047 
1048   PetscFunctionBegin;
1049   /* create symbolic At */
1050   ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
1051   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,PETSC_NULL,&At);CHKERRQ(ierr);
1052   At->rmap->bs = A->cmap->bs;
1053   At->cmap->bs = B->cmap->bs;
1054 
1055   /* get symbolic C=At*B */
1056   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
1057 
1058   /* clean up */
1059   ierr = MatDestroy(&At);CHKERRQ(ierr);
1060   ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
1061   PetscFunctionReturn(0);
1062 }
1063 
1064 #undef __FUNCT__
1065 #define __FUNCT__ "MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ"
1066 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1067 {
1068   PetscErrorCode ierr;
1069   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1070   PetscInt       am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1071   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1072   PetscLogDouble flops=0.0;
1073   MatScalar      *aa=a->a,*ba,*ca,*caj;
1074 
1075   PetscFunctionBegin;
1076   if (!c->a) {
1077     ierr = PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
1078     c->a      = ca;
1079     c->free_a = PETSC_TRUE;
1080   } else {
1081     ca = c->a;
1082   }
1083   /* clear old values in C */
1084   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1085 
1086   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1087   for (i=0;i<am;i++) {
1088     bj   = b->j + bi[i];
1089     ba   = b->a + bi[i];
1090     bnzi = bi[i+1] - bi[i];
1091     anzi = ai[i+1] - ai[i];
1092     for (j=0; j<anzi; j++) {
1093       nextb = 0;
1094       crow  = *aj++;
1095       cjj   = cj + ci[crow];
1096       caj   = ca + ci[crow];
1097       /* perform sparse axpy operation.  Note cjj includes bj. */
1098       for (k=0; nextb<bnzi; k++) {
1099         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
1100           caj[k] += (*aa)*(*(ba+nextb));
1101           nextb++;
1102         }
1103       }
1104       flops += 2*bnzi;
1105       aa++;
1106     }
1107   }
1108 
1109   /* Assemble the final matrix and clean up */
1110   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1111   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1112   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1113   PetscFunctionReturn(0);
1114 }
1115 
1116 EXTERN_C_BEGIN
1117 #undef __FUNCT__
1118 #define __FUNCT__ "MatMatMult_SeqAIJ_SeqDense"
1119 PetscErrorCode MatMatMult_SeqAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
1120 {
1121   PetscErrorCode ierr;
1122 
1123   PetscFunctionBegin;
1124   if (scall == MAT_INITIAL_MATRIX) {
1125     ierr = MatMatMultSymbolic_SeqAIJ_SeqDense(A,B,fill,C);CHKERRQ(ierr);
1126   }
1127   ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,B,*C);CHKERRQ(ierr);
1128   PetscFunctionReturn(0);
1129 }
1130 EXTERN_C_END
1131 
1132 #undef __FUNCT__
1133 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqDense"
1134 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
1135 {
1136   PetscErrorCode ierr;
1137 
1138   PetscFunctionBegin;
1139   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
1140   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1141   PetscFunctionReturn(0);
1142 }
1143 
1144 #undef __FUNCT__
1145 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqDense"
1146 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1147 {
1148   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
1149   PetscErrorCode ierr;
1150   PetscScalar    *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
1151   MatScalar      *aa;
1152   PetscInt       cm=C->rmap->n, cn=B->cmap->n, bm=B->rmap->n, col, i,j,n,*aj, am = A->rmap->n;
1153   PetscInt       am2 = 2*am, am3 = 3*am,  bm4 = 4*bm,colam;
1154 
1155   PetscFunctionBegin;
1156   if (!cm || !cn) PetscFunctionReturn(0);
1157   if (bm != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,bm);
1158   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1159   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1160   ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr);
1161   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1162   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1163   for (col=0; col<cn-4; col += 4) {  /* over columns of C */
1164     colam = col*am;
1165     for (i=0; i<am; i++) {        /* over rows of C in those columns */
1166       r1 = r2 = r3 = r4 = 0.0;
1167       n   = a->i[i+1] - a->i[i];
1168       aj  = a->j + a->i[i];
1169       aa  = a->a + a->i[i];
1170       for (j=0; j<n; j++) {
1171         r1 += (*aa)*b1[*aj];
1172         r2 += (*aa)*b2[*aj];
1173         r3 += (*aa)*b3[*aj];
1174         r4 += (*aa++)*b4[*aj++];
1175       }
1176       c[colam + i]       = r1;
1177       c[colam + am + i]  = r2;
1178       c[colam + am2 + i] = r3;
1179       c[colam + am3 + i] = r4;
1180     }
1181     b1 += bm4;
1182     b2 += bm4;
1183     b3 += bm4;
1184     b4 += bm4;
1185   }
1186   for (;col<cn; col++) {     /* over extra columns of C */
1187     for (i=0; i<am; i++) {  /* over rows of C in those columns */
1188       r1 = 0.0;
1189       n   = a->i[i+1] - a->i[i];
1190       aj  = a->j + a->i[i];
1191       aa  = a->a + a->i[i];
1192 
1193       for (j=0; j<n; j++) {
1194         r1 += (*aa++)*b1[*aj++];
1195       }
1196       c[col*am + i]     = r1;
1197     }
1198     b1 += bm;
1199   }
1200   ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr);
1201   ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr);
1202   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1203   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1204   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1205   PetscFunctionReturn(0);
1206 }
1207 
1208 /*
1209    Note very similar to MatMult_SeqAIJ(), should generate both codes from same base
1210 */
1211 #undef __FUNCT__
1212 #define __FUNCT__ "MatMatMultNumericAdd_SeqAIJ_SeqDense"
1213 PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1214 {
1215   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
1216   PetscErrorCode ierr;
1217   PetscScalar    *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
1218   MatScalar      *aa;
1219   PetscInt       cm=C->rmap->n, cn=B->cmap->n, bm=B->rmap->n, col, i,j,n,*aj, am = A->rmap->n,*ii,arm;
1220   PetscInt       am2 = 2*am, am3 = 3*am,  bm4 = 4*bm,colam,*ridx;
1221 
1222   PetscFunctionBegin;
1223   if (!cm || !cn) PetscFunctionReturn(0);
1224   ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr);
1225   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1226   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1227 
1228   if (a->compressedrow.use) { /* use compressed row format */
1229     for (col=0; col<cn-4; col += 4) {  /* over columns of C */
1230       colam = col*am;
1231       arm   = a->compressedrow.nrows;
1232       ii    = a->compressedrow.i;
1233       ridx  = a->compressedrow.rindex;
1234       for (i=0; i<arm; i++) {        /* over rows of C in those columns */
1235         r1 = r2 = r3 = r4 = 0.0;
1236         n   = ii[i+1] - ii[i];
1237         aj  = a->j + ii[i];
1238         aa  = a->a + ii[i];
1239         for (j=0; j<n; j++) {
1240           r1 += (*aa)*b1[*aj];
1241           r2 += (*aa)*b2[*aj];
1242           r3 += (*aa)*b3[*aj];
1243           r4 += (*aa++)*b4[*aj++];
1244         }
1245         c[colam       + ridx[i]] += r1;
1246         c[colam + am  + ridx[i]] += r2;
1247         c[colam + am2 + ridx[i]] += r3;
1248         c[colam + am3 + ridx[i]] += r4;
1249       }
1250       b1 += bm4;
1251       b2 += bm4;
1252       b3 += bm4;
1253       b4 += bm4;
1254     }
1255     for (;col<cn; col++) {     /* over extra columns of C */
1256       colam = col*am;
1257       arm   = a->compressedrow.nrows;
1258       ii    = a->compressedrow.i;
1259       ridx  = a->compressedrow.rindex;
1260       for (i=0; i<arm; i++) {  /* over rows of C in those columns */
1261         r1 = 0.0;
1262         n   = ii[i+1] - ii[i];
1263         aj  = a->j + ii[i];
1264         aa  = a->a + ii[i];
1265 
1266         for (j=0; j<n; j++) {
1267           r1 += (*aa++)*b1[*aj++];
1268         }
1269         c[colam + ridx[i]] += r1;
1270       }
1271       b1 += bm;
1272     }
1273   } else {
1274     for (col=0; col<cn-4; col += 4) {  /* over columns of C */
1275       colam = col*am;
1276       for (i=0; i<am; i++) {        /* over rows of C in those columns */
1277         r1 = r2 = r3 = r4 = 0.0;
1278         n   = a->i[i+1] - a->i[i];
1279         aj  = a->j + a->i[i];
1280         aa  = a->a + a->i[i];
1281         for (j=0; j<n; j++) {
1282           r1 += (*aa)*b1[*aj];
1283           r2 += (*aa)*b2[*aj];
1284           r3 += (*aa)*b3[*aj];
1285           r4 += (*aa++)*b4[*aj++];
1286         }
1287         c[colam + i]       += r1;
1288         c[colam + am + i]  += r2;
1289         c[colam + am2 + i] += r3;
1290         c[colam + am3 + i] += r4;
1291       }
1292       b1 += bm4;
1293       b2 += bm4;
1294       b3 += bm4;
1295       b4 += bm4;
1296     }
1297     for (;col<cn; col++) {     /* over extra columns of C */
1298       colam = col*am;
1299       for (i=0; i<am; i++) {  /* over rows of C in those columns */
1300         r1 = 0.0;
1301         n   = a->i[i+1] - a->i[i];
1302         aj  = a->j + a->i[i];
1303         aa  = a->a + a->i[i];
1304 
1305         for (j=0; j<n; j++) {
1306           r1 += (*aa++)*b1[*aj++];
1307         }
1308         c[colam + i]     += r1;
1309       }
1310       b1 += bm;
1311     }
1312   }
1313   ierr = PetscLogFlops(cn*2.0*a->nz);CHKERRQ(ierr);
1314   ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr);
1315   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1316   PetscFunctionReturn(0);
1317 }
1318 
1319 #undef __FUNCT__
1320 #define __FUNCT__ "MatTransColoringApplySpToDen_SeqAIJ"
1321 PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1322 {
1323   PetscErrorCode ierr;
1324   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
1325   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
1326   PetscInt       *bi=b->i,*bj=b->j;
1327   PetscInt       m=Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1328   MatScalar      *btval,*btval_den,*ba=b->a;
1329   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1330 
1331   PetscFunctionBegin;
1332   btval_den=btdense->v;
1333   ierr = PetscMemzero(btval_den,(m*n)*sizeof(MatScalar));CHKERRQ(ierr);
1334   for (k=0; k<ncolors; k++) {
1335     ncolumns = coloring->ncolumns[k];
1336     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1337       col   = *(columns + colorforcol[k] + l);
1338       btcol = bj + bi[col];
1339       btval = ba + bi[col];
1340       anz   = bi[col+1] - bi[col];
1341       for (j=0; j<anz; j++) {
1342         brow            = btcol[j];
1343         btval_den[brow] = btval[j];
1344       }
1345     }
1346     btval_den += m;
1347   }
1348   PetscFunctionReturn(0);
1349 }
1350 
1351 #undef __FUNCT__
1352 #define __FUNCT__ "MatTransColoringApplyDenToSp_SeqAIJ"
1353 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1354 {
1355   PetscErrorCode ierr;
1356   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)Csp->data;
1357   PetscInt       k,l,*row,*idx,m,ncolors=matcoloring->ncolors,nrows;
1358   PetscScalar    *ca_den,*cp_den,*ca=csp->a;
1359   PetscInt       *rows=matcoloring->rows,*spidx=matcoloring->columnsforspidx,*colorforrow=matcoloring->colorforrow;
1360 
1361   PetscFunctionBegin;
1362   ierr = MatGetLocalSize(Csp,&m,PETSC_NULL);CHKERRQ(ierr);
1363   ierr = MatDenseGetArray(Cden,&ca_den);CHKERRQ(ierr);
1364   cp_den = ca_den;
1365   for (k=0; k<ncolors; k++) {
1366     nrows = matcoloring->nrows[k];
1367     row   = rows  + colorforrow[k];
1368     idx   = spidx + colorforrow[k];
1369     for (l=0; l<nrows; l++) {
1370       ca[idx[l]] = cp_den[row[l]];
1371     }
1372     cp_den += m;
1373   }
1374   ierr = MatDenseRestoreArray(Cden,&ca_den);CHKERRQ(ierr);
1375   PetscFunctionReturn(0);
1376 }
1377 
1378 /*
1379  MatGetColumnIJ_SeqAIJ_Color() and MatRestoreColumnIJ_SeqAIJ_Color() are customized from
1380  MatGetColumnIJ_SeqAIJ() and MatRestoreColumnIJ_SeqAIJ() by adding an output
1381  spidx[], index of a->j, to be used for setting 'columnsforspidx' in MatTransposeColoringCreate_SeqAIJ().
1382  */
1383 #undef __FUNCT__
1384 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Color"
1385 PetscErrorCode MatGetColumnIJ_SeqAIJ_Color(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  inodecompressed,PetscInt *nn,const PetscInt *ia[],const PetscInt *ja[],PetscInt *spidx[],PetscBool  *done)
1386 {
1387   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
1388   PetscErrorCode ierr;
1389   PetscInt       i,*collengths,*cia,*cja,n = A->cmap->n,m = A->rmap->n;
1390   PetscInt       nz = a->i[m],row,*jj,mr,col;
1391   PetscInt       *cspidx;
1392 
1393   PetscFunctionBegin;
1394   *nn = n;
1395   if (!ia) PetscFunctionReturn(0);
1396   if (symmetric) {
1397     SETERRQ(((PetscObject)A)->comm,PETSC_ERR_SUP,"MatGetColumnIJ_SeqAIJ_Color() not supported for the case symmetric");
1398     ierr = MatToSymmetricIJ_SeqAIJ(A->rmap->n,a->i,a->j,0,oshift,(PetscInt**)ia,(PetscInt**)ja);CHKERRQ(ierr);
1399   } else {
1400     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&collengths);CHKERRQ(ierr);
1401     ierr = PetscMemzero(collengths,n*sizeof(PetscInt));CHKERRQ(ierr);
1402     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&cia);CHKERRQ(ierr);
1403     ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&cja);CHKERRQ(ierr);
1404     ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&cspidx);CHKERRQ(ierr);
1405     jj = a->j;
1406     for (i=0; i<nz; i++) {
1407       collengths[jj[i]]++;
1408     }
1409     cia[0] = oshift;
1410     for (i=0; i<n; i++) {
1411       cia[i+1] = cia[i] + collengths[i];
1412     }
1413     ierr = PetscMemzero(collengths,n*sizeof(PetscInt));CHKERRQ(ierr);
1414     jj   = a->j;
1415     for (row=0; row<m; row++) {
1416       mr = a->i[row+1] - a->i[row];
1417       for (i=0; i<mr; i++) {
1418         col = *jj++;
1419         cspidx[cia[col] + collengths[col] - oshift] = a->i[row] + i; /* index of a->j */
1420         cja[cia[col] + collengths[col]++ - oshift] = row + oshift;
1421       }
1422     }
1423     ierr = PetscFree(collengths);CHKERRQ(ierr);
1424     *ia = cia; *ja = cja;
1425     *spidx = cspidx;
1426   }
1427   PetscFunctionReturn(0);
1428 }
1429 
1430 #undef __FUNCT__
1431 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Color"
1432 PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Color(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  inodecompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscInt *spidx[],PetscBool  *done)
1433 {
1434   PetscErrorCode ierr;
1435 
1436   PetscFunctionBegin;
1437   if (!ia) PetscFunctionReturn(0);
1438 
1439   ierr = PetscFree(*ia);CHKERRQ(ierr);
1440   ierr = PetscFree(*ja);CHKERRQ(ierr);
1441   ierr = PetscFree(*spidx);CHKERRQ(ierr);
1442   PetscFunctionReturn(0);
1443 }
1444 
1445 #undef __FUNCT__
1446 #define __FUNCT__ "MatTransposeColoringCreate_SeqAIJ"
1447 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1448 {
1449   PetscErrorCode ierr;
1450   PetscInt       i,n,nrows,N,j,k,m,ncols,col,cm;
1451   const PetscInt *is,*ci,*cj,*row_idx;
1452   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1453   IS             *isa;
1454   PetscBool      flg1,flg2;
1455   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1456   PetscInt       *colorforrow,*rows,*rows_i,*columnsforspidx,*columnsforspidx_i,*idxhit,*spidx;
1457   PetscInt       *colorforcol,*columns,*columns_i;
1458 
1459   PetscFunctionBegin;
1460   ierr = ISColoringGetIS(iscoloring,PETSC_IGNORE,&isa);CHKERRQ(ierr);
1461 
1462   /* this is ugly way to get blocksize but cannot call MatGetBlockSize() because AIJ can have bs > 1 */
1463   ierr = PetscObjectTypeCompare((PetscObject)mat,MATSEQBAIJ,&flg1);CHKERRQ(ierr);
1464   ierr = PetscObjectTypeCompare((PetscObject)mat,MATMPIBAIJ,&flg2);CHKERRQ(ierr);
1465   if (flg1 || flg2) {
1466     ierr = MatGetBlockSize(mat,&bs);CHKERRQ(ierr);
1467   }
1468 
1469   N         = mat->cmap->N/bs;
1470   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1471   c->N      = mat->cmap->N/bs;
1472   c->m      = mat->rmap->N/bs;
1473   c->rstart = 0;
1474 
1475   c->ncolors = nis;
1476   ierr       = PetscMalloc(nis*sizeof(PetscInt),&c->ncolumns);CHKERRQ(ierr);
1477   ierr       = PetscMalloc(nis*sizeof(PetscInt),&c->nrows);CHKERRQ(ierr);
1478   ierr       = PetscMalloc2(csp->nz+1,PetscInt,&rows,csp->nz+1,PetscInt,&columnsforspidx);CHKERRQ(ierr);
1479   ierr       = PetscMalloc((nis+1)*sizeof(PetscInt),&colorforrow);CHKERRQ(ierr);
1480   colorforrow[0]    = 0;
1481   rows_i            = rows;
1482   columnsforspidx_i = columnsforspidx;
1483 
1484   ierr = PetscMalloc((nis+1)*sizeof(PetscInt),&colorforcol);CHKERRQ(ierr);
1485   ierr = PetscMalloc((N+1)*sizeof(PetscInt),&columns);CHKERRQ(ierr);
1486   colorforcol[0] = 0;
1487   columns_i      = columns;
1488 
1489   ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,PETSC_NULL);CHKERRQ(ierr); /* column-wise storage of mat */
1490 
1491   cm = c->m;
1492   ierr = PetscMalloc((cm+1)*sizeof(PetscInt),&rowhit);CHKERRQ(ierr);
1493   ierr = PetscMalloc((cm+1)*sizeof(PetscInt),&idxhit);CHKERRQ(ierr);
1494   for (i=0; i<nis; i++) {
1495     ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr);
1496     ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr);
1497     c->ncolumns[i] = n;
1498     if (n) {
1499       ierr = PetscMemcpy(columns_i,is,n*sizeof(PetscInt));CHKERRQ(ierr);
1500     }
1501     colorforcol[i+1] = colorforcol[i] + n;
1502     columns_i       += n;
1503 
1504     /* fast, crude version requires O(N*N) work */
1505     ierr = PetscMemzero(rowhit,cm*sizeof(PetscInt));CHKERRQ(ierr);
1506 
1507     /* loop over columns*/
1508     for (j=0; j<n; j++) {
1509       col     = is[j];
1510       row_idx = cj + ci[col];
1511       m       = ci[col+1] - ci[col];
1512       /* loop over columns marking them in rowhit */
1513       for (k=0; k<m; k++) {
1514         idxhit[*row_idx]   = spidx[ci[col] + k];
1515         rowhit[*row_idx++] = col + 1;
1516       }
1517     }
1518     /* count the number of hits */
1519     nrows = 0;
1520     for (j=0; j<cm; j++) {
1521       if (rowhit[j]) nrows++;
1522     }
1523     c->nrows[i]      = nrows;
1524     colorforrow[i+1] = colorforrow[i] + nrows;
1525 
1526     nrows       = 0;
1527     for (j=0; j<cm; j++) {
1528       if (rowhit[j]) {
1529         rows_i[nrows]            = j;
1530         columnsforspidx_i[nrows] = idxhit[j];
1531         nrows++;
1532       }
1533     }
1534     ierr = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr);
1535     rows_i += nrows; columnsforspidx_i += nrows;
1536   }
1537   ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,PETSC_NULL);CHKERRQ(ierr);
1538   ierr = PetscFree(rowhit);CHKERRQ(ierr);
1539   ierr = ISColoringRestoreIS(iscoloring,&isa);CHKERRQ(ierr);
1540 #if defined(PETSC_USE_DEBUG)
1541   if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1542 #endif
1543 
1544   c->colorforrow     = colorforrow;
1545   c->rows            = rows;
1546   c->columnsforspidx = columnsforspidx;
1547   c->colorforcol     = colorforcol;
1548   c->columns         = columns;
1549   ierr = PetscFree(idxhit);CHKERRQ(ierr);
1550   PetscFunctionReturn(0);
1551 }
1552