xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 0d04baf89eca684847eb03c0d4ee6b8dfe3c620a)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4           C = A * B
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <petscbt.h>
10 #include <../src/mat/impls/dense/seq/dense.h> /*I "petscmat.h" I*/
11 
12 EXTERN_C_BEGIN
13 #undef __FUNCT__
14 #define __FUNCT__ "MatMatMult_SeqAIJ_SeqAIJ"
15 PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
16 {
17   PetscErrorCode ierr;
18 
19   PetscFunctionBegin;
20   if (scall == MAT_INITIAL_MATRIX){
21     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
22     ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
23     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
24   }
25   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
26   ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
27   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
28   PetscFunctionReturn(0);
29 }
30 EXTERN_C_END
31 
32 #undef __FUNCT__
33 #define __FUNCT__ "MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ"
34 PetscErrorCode MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ(PetscInt am,PetscInt *Ai,PetscInt *Aj,PetscInt bm,PetscInt bn,PetscInt *Bi,PetscInt *Bj,PetscReal fill,PetscInt *Ci[],PetscInt *Cj[],PetscInt *nspacedouble)
35 {
36   PetscErrorCode     ierr;
37   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
38   PetscInt           *ai=Ai,*aj=Aj,*bi=Bi,*bj=Bj,*bjj,*ci,*cj;
39   PetscInt           i,j,anzi,brow,bnzj,cnzi,nlnk,*lnk,ndouble=0;
40   PetscBT            lnkbt;
41 
42   PetscFunctionBegin;
43   /* Allocate ci array, arrays for fill computation and */
44   /* free space for accumulating nonzero column info */
45   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
46   ci[0] = 0;
47 
48   /* create and initialize a linked list */
49   nlnk = bn+1;
50   ierr = PetscLLCreate(bn,bn,nlnk,lnk,lnkbt);CHKERRQ(ierr);
51 
52   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
53   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
54   current_space = free_space;
55 
56   /* Determine symbolic info for each row of the product: */
57   for (i=0; i<am; i++) {
58     anzi = ai[i+1] - ai[i];
59     cnzi = 0;
60     aj   = Aj + ai[i];
61     for (j=0; j<anzi; j++){
62       brow = aj[j];
63       bnzj = bi[brow+1] - bi[brow];
64       bjj  = bj + bi[brow];
65       /* add non-zero cols of B into the sorted linked list lnk */
66       ierr = PetscLLAdd(bnzj,bjj,bn,nlnk,lnk,lnkbt);CHKERRQ(ierr);
67       cnzi += nlnk;
68     }
69 
70     /* If free space is not available, make more free space */
71     /* Double the amount of total space in the list */
72     if (current_space->local_remaining<cnzi) {
73       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
74       ndouble++;
75     }
76 
77     /* Copy data into free space, then initialize lnk */
78     ierr = PetscLLClean(bn,bn,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
79     current_space->array           += cnzi;
80     current_space->local_used      += cnzi;
81     current_space->local_remaining -= cnzi;
82     ci[i+1] = ci[i] + cnzi;
83   }
84 
85   /* Column indices are in the list of free space */
86   /* Allocate space for cj, initialize cj, and */
87   /* destroy list of free space and other temporary array(s) */
88   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
89   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
90   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
91 
92   *Ci           = ci;
93   *Cj           = cj;
94   *nspacedouble = ndouble;
95   PetscFunctionReturn(0);
96 }
97 
98 #undef __FUNCT__
99 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqAIJ"
100 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
101 {
102   PetscErrorCode ierr;
103   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
104   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ci,*cj;
105   PetscInt       am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N,nspacedouble;
106   MatScalar      *ca;
107   PetscReal      afill;
108   PetscBool      dense_axpy; /* false: use sparse axpy; otherwise use dense axpy in MatMatMultNumeric_SeqAIJ_SeqAIJ() */
109 
110   PetscFunctionBegin;
111   /* Get ci and cj */
112   ierr = MatGetSymbolicMatMatMult_SeqAIJ_SeqAIJ(am,ai,aj,bm,bn,bi,bj,fill,&ci,&cj,&nspacedouble);CHKERRQ(ierr);
113 
114   /* Allocate space for ca */
115   ierr = PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
116   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
117 
118   /* put together the new symbolic matrix */
119   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,ca,C);CHKERRQ(ierr);
120 
121   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
122   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
123   c = (Mat_SeqAIJ *)((*C)->data);
124   c->free_a   = PETSC_TRUE;
125   c->free_ij  = PETSC_TRUE;
126   c->nonew    = 0;
127   (*C)->ops->matmult = MatMatMult_SeqAIJ_SeqAIJ;
128 
129   /* Determine which MatMatMultNumeric_SeqAIJ_SeqAIJ() to be used */
130   dense_axpy = PETSC_TRUE;
131   ierr = PetscOptionsGetBool(PETSC_NULL,"-matmatmult_denseaxpy",&dense_axpy,PETSC_NULL);CHKERRQ(ierr);
132   if (dense_axpy){
133     ierr = PetscMalloc(bn*sizeof(PetscScalar),&c->matmult_abdense);CHKERRQ(ierr);
134     ierr = PetscMemzero(c->matmult_abdense,dense_axpy*bn*sizeof(PetscScalar));CHKERRQ(ierr);
135     (*C)->ops->matmultnumeric =  MatMatMultNumeric_SeqAIJ_SeqAIJ; /* fast, takes additional dense_axpy*bn*sizeof(PetscScalar) space */
136   } else { /* slower, but use less memory */
137     (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_SparseAxpy; /* slower, less memory */
138   }
139 
140   /* set MatInfo */
141   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
142   if (afill < 1.0) afill = 1.0;
143   c->maxnz                     = ci[am];
144   c->nz                        = ci[am];
145   (*C)->info.mallocs           = nspacedouble;
146   (*C)->info.fill_ratio_given  = fill;
147   (*C)->info.fill_ratio_needed = afill;
148 
149 #if defined(PETSC_USE_INFO)
150   if (ci[am]) {
151     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
152     ierr = PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
153   } else {
154     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
155   }
156 #endif
157   PetscFunctionReturn(0);
158 }
159 
160 #undef __FUNCT__
161 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqAIJ"
162 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
163 {
164   PetscErrorCode ierr;
165   PetscLogDouble flops=0.0;
166   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
167   Mat_SeqAIJ     *b = (Mat_SeqAIJ *)B->data;
168   Mat_SeqAIJ     *c = (Mat_SeqAIJ *)C->data;
169   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
170   PetscInt       am=A->rmap->n,cm=C->rmap->n;
171   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
172   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
173   PetscScalar    *ab_dense=c->matmult_abdense;
174 
175   PetscFunctionBegin;
176   /* clean old values in C */
177   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
178   /* Traverse A row-wise. */
179   /* Build the ith row in C by summing over nonzero columns in A, */
180   /* the rows of B corresponding to nonzeros of A. */
181   for (i=0; i<am; i++) {
182     anzi = ai[i+1] - ai[i];
183     for (j=0; j<anzi; j++) {
184       brow = aj[j];
185       bnzi = bi[brow+1] - bi[brow];
186       bjj  = bj + bi[brow];
187       baj  = ba + bi[brow];
188       /* perform dense axpy */
189       valtmp = aa[j];
190       for (k=0; k<bnzi; k++) {
191         ab_dense[bjj[k]] += valtmp*baj[k];
192       }
193       flops += 2*bnzi;
194     }
195     aj += anzi; aa += anzi;
196 
197     cnzi = ci[i+1] - ci[i];
198     for (k=0; k<cnzi; k++) {
199       ca[k]          += ab_dense[cj[k]];
200       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
201     }
202     flops += cnzi;
203     cj += cnzi; ca += cnzi;
204   }
205   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
206   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
207   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
208   C->ops->matmultnumeric =  MatMatMultNumeric_SeqAIJ_SeqAIJ;
209   PetscFunctionReturn(0);
210 }
211 
212 #undef __FUNCT__
213 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqAIJ_SparseAxpy"
214 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_SparseAxpy(Mat A,Mat B,Mat C)
215 {
216   PetscErrorCode ierr;
217   PetscLogDouble flops=0.0;
218   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
219   Mat_SeqAIJ     *b = (Mat_SeqAIJ *)B->data;
220   Mat_SeqAIJ     *c = (Mat_SeqAIJ *)C->data;
221   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
222   PetscInt       am=A->rmap->N,cm=C->rmap->N;
223   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
224   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
225   PetscInt       nextb;
226 
227   PetscFunctionBegin;
228   /* clean old values in C */
229   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
230   /* Traverse A row-wise. */
231   /* Build the ith row in C by summing over nonzero columns in A, */
232   /* the rows of B corresponding to nonzeros of A. */
233   for (i=0;i<am;i++) {
234     anzi = ai[i+1] - ai[i];
235     cnzi = ci[i+1] - ci[i];
236     for (j=0;j<anzi;j++) {
237       brow = aj[j];
238       bnzi = bi[brow+1] - bi[brow];
239       bjj  = bj + bi[brow];
240       baj  = ba + bi[brow];
241       /* perform sparse axpy */
242       valtmp = aa[j];
243       nextb  = 0;
244       for (k=0; nextb<bnzi; k++) {
245         if (cj[k] == bjj[nextb]){ /* ccol == bcol */
246           ca[k] += valtmp*baj[nextb++];
247         }
248       }
249       flops += 2*bnzi;
250     }
251     aj += anzi; aa += anzi;
252     cj += cnzi; ca += cnzi;
253   }
254 
255   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
256   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
257   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
258   PetscFunctionReturn(0);
259 }
260 
261 /* This routine is not used. Should be removed! */
262 #undef __FUNCT__
263 #define __FUNCT__ "MatMatTransposeMult_SeqAIJ_SeqAIJ"
264 PetscErrorCode MatMatTransposeMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
265 {
266   PetscErrorCode ierr;
267 
268   PetscFunctionBegin;
269   if (scall == MAT_INITIAL_MATRIX){
270     ierr = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
271   }
272   ierr = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
273   PetscFunctionReturn(0);
274 }
275 
276 #undef __FUNCT__
277 #define __FUNCT__ "PetscContainerDestroy_Mat_MatMatTransMult"
278 PetscErrorCode PetscContainerDestroy_Mat_MatMatTransMult(void *ptr)
279 {
280   PetscErrorCode      ierr;
281   Mat_MatMatTransMult *multtrans=(Mat_MatMatTransMult*)ptr;
282 
283   PetscFunctionBegin;
284   ierr = MatTransposeColoringDestroy(&multtrans->matcoloring);CHKERRQ(ierr);
285   ierr = MatDestroy(&multtrans->Bt_den);CHKERRQ(ierr);
286   ierr = MatDestroy(&multtrans->ABt_den);CHKERRQ(ierr);
287   ierr = PetscFree(multtrans);CHKERRQ(ierr);
288   PetscFunctionReturn(0);
289 }
290 
291 #undef __FUNCT__
292 #define __FUNCT__ "MatDestroy_SeqAIJ_MatMatMultTrans"
293 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A)
294 {
295   PetscErrorCode      ierr;
296   PetscContainer      container;
297   Mat_MatMatTransMult *multtrans=PETSC_NULL;
298 
299   PetscFunctionBegin;
300   ierr = PetscObjectQuery((PetscObject)A,"Mat_MatMatTransMult",(PetscObject *)&container);CHKERRQ(ierr);
301   if (!container) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Container does not exit");
302   ierr = PetscContainerGetPointer(container,(void **)&multtrans);CHKERRQ(ierr);
303   A->ops->destroy   = multtrans->destroy;
304   if (A->ops->destroy) {
305     ierr = (*A->ops->destroy)(A);CHKERRQ(ierr);
306   }
307   ierr = PetscObjectCompose((PetscObject)A,"Mat_MatMatTransMult",0);CHKERRQ(ierr);
308   PetscFunctionReturn(0);
309 }
310 
311 #undef __FUNCT__
312 #define __FUNCT__ "MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ"
313 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
314 {
315   PetscErrorCode      ierr;
316   Mat                 Bt;
317   PetscInt            *bti,*btj;
318   Mat_MatMatTransMult *multtrans;
319   PetscContainer      container;
320   PetscLogDouble      t0,tf,etime2=0.0;
321 
322   PetscFunctionBegin;
323   ierr = PetscGetTime(&t0);CHKERRQ(ierr);
324    /* create symbolic Bt */
325   ierr = MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
326   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,PETSC_NULL,&Bt);CHKERRQ(ierr);
327 
328   /* get symbolic C=A*Bt */
329   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);CHKERRQ(ierr);
330 
331   /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
332   ierr = PetscNew(Mat_MatMatTransMult,&multtrans);CHKERRQ(ierr);
333 
334   /* attach the supporting struct to C */
335   ierr = PetscContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
336   ierr = PetscContainerSetPointer(container,multtrans);CHKERRQ(ierr);
337   ierr = PetscContainerSetUserDestroy(container,PetscContainerDestroy_Mat_MatMatTransMult);CHKERRQ(ierr);
338   ierr = PetscObjectCompose((PetscObject)(*C),"Mat_MatMatTransMult",(PetscObject)container);CHKERRQ(ierr);
339   ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
340 
341   multtrans->usecoloring = PETSC_FALSE;
342   multtrans->destroy = (*C)->ops->destroy;
343   (*C)->ops->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
344 
345   ierr = PetscGetTime(&tf);CHKERRQ(ierr);
346   etime2 += tf - t0;
347 
348   ierr = PetscOptionsGetBool(PETSC_NULL,"-matmattransmult_color",&multtrans->usecoloring,PETSC_NULL);CHKERRQ(ierr);
349   if (multtrans->usecoloring){
350     /* Create MatTransposeColoring from symbolic C=A*B^T */
351     MatTransposeColoring matcoloring;
352     ISColoring           iscoloring;
353     Mat                  Bt_dense,C_dense;
354     PetscLogDouble       etime0=0.0,etime01=0.0,etime1=0.0;
355 
356     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
357     ierr = MatGetColoring(*C,MATCOLORINGLF,&iscoloring);CHKERRQ(ierr);
358     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
359     etime0 += tf - t0;
360 
361     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
362     ierr = MatTransposeColoringCreate(*C,iscoloring,&matcoloring);CHKERRQ(ierr);
363     multtrans->matcoloring = matcoloring;
364     ierr = ISColoringDestroy(&iscoloring);CHKERRQ(ierr);
365     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
366     etime01 += tf - t0;
367 
368     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
369     /* Create Bt_dense and C_dense = A*Bt_dense */
370     ierr = MatCreate(PETSC_COMM_SELF,&Bt_dense);CHKERRQ(ierr);
371     ierr = MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);CHKERRQ(ierr);
372     ierr = MatSetType(Bt_dense,MATSEQDENSE);CHKERRQ(ierr);
373     ierr = MatSeqDenseSetPreallocation(Bt_dense,PETSC_NULL);CHKERRQ(ierr);
374     Bt_dense->assembled = PETSC_TRUE;
375     multtrans->Bt_den = Bt_dense;
376 
377     ierr = MatCreate(PETSC_COMM_SELF,&C_dense);CHKERRQ(ierr);
378     ierr = MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);CHKERRQ(ierr);
379     ierr = MatSetType(C_dense,MATSEQDENSE);CHKERRQ(ierr);
380     ierr = MatSeqDenseSetPreallocation(C_dense,PETSC_NULL);CHKERRQ(ierr);
381     Bt_dense->assembled = PETSC_TRUE;
382     multtrans->ABt_den = C_dense;
383     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
384     etime1 += tf - t0;
385 
386 #if defined(PETSC_USE_INFO)
387     {
388     Mat_SeqAIJ *c=(Mat_SeqAIJ*)(*C)->data;
389     ierr = PetscInfo5(*C,"Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",A->cmap->n,matcoloring->ncolors,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));
390     ierr = PetscInfo5(*C,"Sym = GetColor %g + ColorCreate %g + MatDenseCreate %g + non-colorSym %g = %g\n",etime0,etime01,etime1,etime2,etime0+etime01+etime1+etime2);
391     }
392 #endif
393   }
394   /* clean up */
395   ierr = MatDestroy(&Bt);CHKERRQ(ierr);
396   ierr = MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);CHKERRQ(ierr);
397 
398 
399 
400 #if defined(INEFFICIENT_ALGORITHM)
401   /* The algorithm below computes am*bm sparse inner-product - inefficient! It will be deleted later. */
402   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
403   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
404   PetscInt           *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ci,*cj,*acol,*bcol;
405   PetscInt           am=A->rmap->N,bm=B->rmap->N;
406   PetscInt           i,j,anzi,bnzj,cnzi,nlnk,*lnk,nspacedouble=0,ka,kb,index[1];
407   MatScalar          *ca;
408   PetscBT            lnkbt;
409   PetscReal          afill;
410 
411   /* Allocate row pointer array ci  */
412   ierr = PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
413   ci[0] = 0;
414 
415   /* Create and initialize a linked list for C columns */
416   nlnk = bm+1;
417   ierr = PetscLLCreate(bm,bm,nlnk,lnk,lnkbt);CHKERRQ(ierr);
418 
419   /* Initial FreeSpace with size fill*(nnz(A)+nnz(B)) */
420   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);CHKERRQ(ierr);
421   current_space = free_space;
422 
423   /* Determine symbolic info for each row of the product A*B^T: */
424   for (i=0; i<am; i++) {
425     anzi = ai[i+1] - ai[i];
426     cnzi = 0;
427     acol = aj + ai[i];
428     for (j=0; j<bm; j++){
429       bnzj = bi[j+1] - bi[j];
430       bcol= bj + bi[j];
431       /* sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
432       ka = 0; kb = 0;
433       while (ka < anzi && kb < bnzj){
434         while (acol[ka] < bcol[kb] && ka < anzi) ka++;
435         if (ka == anzi) break;
436         while (acol[ka] > bcol[kb] && kb < bnzj) kb++;
437         if (kb == bnzj) break;
438         if (acol[ka] == bcol[kb]){ /* add nonzero c(i,j) to lnk */
439           index[0] = j;
440           ierr = PetscLLAdd(1,index,bm,nlnk,lnk,lnkbt);CHKERRQ(ierr);
441           cnzi++;
442           break;
443         }
444       }
445     }
446 
447     /* If free space is not available, make more free space */
448     /* Double the amount of total space in the list */
449     if (current_space->local_remaining<cnzi) {
450       ierr = PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);CHKERRQ(ierr);
451       nspacedouble++;
452     }
453 
454     /* Copy data into free space, then initialize lnk */
455     ierr = PetscLLClean(bm,bm,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
456     current_space->array           += cnzi;
457     current_space->local_used      += cnzi;
458     current_space->local_remaining -= cnzi;
459 
460     ci[i+1] = ci[i] + cnzi;
461   }
462 
463 
464   /* Column indices are in the list of free space.
465      Allocate array cj, copy column indices to cj, and destroy list of free space */
466   ierr = PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
467   ierr = PetscFreeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
468   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
469 
470   /* Allocate space for ca */
471   ierr = PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
472   ierr = PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));CHKERRQ(ierr);
473 
474   /* put together the new symbolic matrix */
475   ierr = MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bm,ci,cj,ca,C);CHKERRQ(ierr);
476 
477   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
478   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
479   c = (Mat_SeqAIJ *)((*C)->data);
480   c->free_a   = PETSC_TRUE;
481   c->free_ij  = PETSC_TRUE;
482   c->nonew    = 0;
483 
484   /* set MatInfo */
485   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
486   if (afill < 1.0) afill = 1.0;
487   c->maxnz                     = ci[am];
488   c->nz                        = ci[am];
489   (*C)->info.mallocs           = nspacedouble;
490   (*C)->info.fill_ratio_given  = fill;
491   (*C)->info.fill_ratio_needed = afill;
492 
493 #if defined(PETSC_USE_INFO)
494   if (ci[am]) {
495     ierr = PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
496     ierr = PetscInfo1((*C),"Use MatMatTransposeMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
497   } else {
498     ierr = PetscInfo((*C),"Empty matrix product\n");CHKERRQ(ierr);
499   }
500 #endif
501 #endif
502   PetscFunctionReturn(0);
503 }
504 
505 /* #define USE_ARRAY - for sparse dot product. Slower than !USE_ARRAY */
506 #undef __FUNCT__
507 #define __FUNCT__ "MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ"
508 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
509 {
510   PetscErrorCode ierr;
511   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
512   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
513   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
514   PetscLogDouble flops=0.0;
515   MatScalar      *aa=a->a,*aval,*ba=b->a,*bval,*ca=c->a,*cval;
516   Mat_MatMatTransMult *multtrans;
517   PetscContainer      container;
518 #if defined(USE_ARRAY)
519   MatScalar      *spdot;
520 #endif
521 
522   PetscFunctionBegin;
523   ierr = PetscObjectQuery((PetscObject)C,"Mat_MatMatTransMult",(PetscObject *)&container);CHKERRQ(ierr);
524   if (!container) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Container does not exit");
525   ierr  = PetscContainerGetPointer(container,(void **)&multtrans);CHKERRQ(ierr);
526   if (multtrans->usecoloring){
527     MatTransposeColoring  matcoloring = multtrans->matcoloring;
528     Mat                   Bt_dense;
529     PetscInt              m,n;
530     PetscLogDouble t0,tf,etime0=0.0,etime1=0.0,etime2=0.0;
531     Mat C_dense = multtrans->ABt_den;
532 
533     Bt_dense = multtrans->Bt_den;
534     ierr = MatGetLocalSize(Bt_dense,&m,&n);CHKERRQ(ierr);
535 
536     /* Get Bt_dense by Apply MatTransposeColoring to B */
537     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
538     ierr = MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);CHKERRQ(ierr);
539     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
540     etime0 += tf - t0;
541 
542     /* C_dense = A*Bt_dense */
543     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
544     ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);CHKERRQ(ierr);
545     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
546     etime2 += tf - t0;
547 
548     /* Recover C from C_dense */
549     ierr = PetscGetTime(&t0);CHKERRQ(ierr);
550     ierr = MatTransColoringApplyDenToSp(matcoloring,C_dense,C);CHKERRQ(ierr);
551     ierr = PetscGetTime(&tf);CHKERRQ(ierr);
552     etime1 += tf - t0;
553 #if defined(PETSC_USE_INFO)
554     ierr = PetscInfo4(C,"Num = ColoringApply: %g %g + Mult_sp_dense %g = %g\n",etime0,etime1,etime2,etime0+etime1+etime2);
555 #endif
556     PetscFunctionReturn(0);
557   }
558 
559 #if defined(USE_ARRAY)
560   /* allocate an array for implementing sparse inner-product */
561   ierr = PetscMalloc((A->cmap->n+1)*sizeof(MatScalar),&spdot);CHKERRQ(ierr);
562   ierr = PetscMemzero(spdot,(A->cmap->n+1)*sizeof(MatScalar));CHKERRQ(ierr);
563 #endif
564 
565   /* clear old values in C */
566   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
567 
568   for (i=0; i<cm; i++) {
569     anzi = ai[i+1] - ai[i];
570     acol = aj + ai[i];
571     aval = aa + ai[i];
572     cnzi = ci[i+1] - ci[i];
573     ccol = cj + ci[i];
574     cval = ca + ci[i];
575     for (j=0; j<cnzi; j++){
576       brow = ccol[j];
577       bnzj = bi[brow+1] - bi[brow];
578       bcol = bj + bi[brow];
579       bval = ba + bi[brow];
580 
581       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
582 #if defined(USE_ARRAY)
583       /* put ba to spdot array */
584       for (nextb=0; nextb<bnzj; nextb++) spdot[bcol[nextb]] = bval[nextb];
585       /* c(i,j)=A[i,:]*B[j,:]^T */
586       for (nexta=0; nexta<anzi; nexta++){
587         cval[j] += spdot[acol[nexta]]*aval[nexta];
588       }
589       /* zero spdot array */
590       for (nextb=0; nextb<bnzj; nextb++) spdot[bcol[nextb]] = 0.0;
591 #else
592       nexta = 0; nextb = 0;
593       while (nexta<anzi && nextb<bnzj){
594         while (acol[nexta] < bcol[nextb] && nexta < anzi) nexta++;
595         if (nexta == anzi) break;
596         while (acol[nexta] > bcol[nextb] && nextb < bnzj) nextb++;
597         if (nextb == bnzj) break;
598         if (acol[nexta] == bcol[nextb]){
599           cval[j] += aval[nexta]*bval[nextb];
600           nexta++; nextb++;
601           flops += 2;
602         }
603       }
604 #endif
605     }
606   }
607   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
608   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
609   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
610 #if defined(USE_ARRAY)
611   ierr = PetscFree(spdot);
612 #endif
613   PetscFunctionReturn(0);
614 }
615 
616 #undef __FUNCT__
617 #define __FUNCT__ "MatTransposeMatMult_SeqAIJ_SeqAIJ"
618 PetscErrorCode MatTransposeMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C) {
619   PetscErrorCode ierr;
620 
621   PetscFunctionBegin;
622   if (scall == MAT_INITIAL_MATRIX){
623     ierr = MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);CHKERRQ(ierr);
624   }
625   ierr = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);CHKERRQ(ierr);
626   PetscFunctionReturn(0);
627 }
628 
629 #undef __FUNCT__
630 #define __FUNCT__ "MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ"
631 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
632 {
633   PetscErrorCode ierr;
634   Mat            At;
635   PetscInt       *ati,*atj;
636 
637   PetscFunctionBegin;
638   /* create symbolic At */
639   ierr = MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
640   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,PETSC_NULL,&At);CHKERRQ(ierr);
641 
642   /* get symbolic C=At*B */
643   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);CHKERRQ(ierr);
644 
645   /* clean up */
646   ierr = MatDestroy(&At);CHKERRQ(ierr);
647   ierr = MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);CHKERRQ(ierr);
648   PetscFunctionReturn(0);
649 }
650 
651 #undef __FUNCT__
652 #define __FUNCT__ "MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ"
653 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
654 {
655   PetscErrorCode ierr;
656   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
657   PetscInt       am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
658   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
659   PetscLogDouble flops=0.0;
660   MatScalar      *aa=a->a,*ba,*ca=c->a,*caj;
661 
662   PetscFunctionBegin;
663   /* clear old values in C */
664   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
665 
666   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
667   for (i=0;i<am;i++) {
668     bj   = b->j + bi[i];
669     ba   = b->a + bi[i];
670     bnzi = bi[i+1] - bi[i];
671     anzi = ai[i+1] - ai[i];
672     for (j=0; j<anzi; j++) {
673       nextb = 0;
674       crow  = *aj++;
675       cjj   = cj + ci[crow];
676       caj   = ca + ci[crow];
677       /* perform sparse axpy operation.  Note cjj includes bj. */
678       for (k=0; nextb<bnzi; k++) {
679         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
680           caj[k] += (*aa)*(*(ba+nextb));
681           nextb++;
682         }
683       }
684       flops += 2*bnzi;
685       aa++;
686     }
687   }
688 
689   /* Assemble the final matrix and clean up */
690   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
691   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
692   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
693   PetscFunctionReturn(0);
694 }
695 
696 EXTERN_C_BEGIN
697 #undef __FUNCT__
698 #define __FUNCT__ "MatMatMult_SeqAIJ_SeqDense"
699 PetscErrorCode MatMatMult_SeqAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
700 {
701   PetscErrorCode ierr;
702 
703   PetscFunctionBegin;
704   if (scall == MAT_INITIAL_MATRIX){
705     ierr = MatMatMultSymbolic_SeqAIJ_SeqDense(A,B,fill,C);CHKERRQ(ierr);
706   }
707   ierr = MatMatMultNumeric_SeqAIJ_SeqDense(A,B,*C);CHKERRQ(ierr);
708   PetscFunctionReturn(0);
709 }
710 EXTERN_C_END
711 
712 #undef __FUNCT__
713 #define __FUNCT__ "MatMatMultSymbolic_SeqAIJ_SeqDense"
714 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
715 {
716   PetscErrorCode ierr;
717 
718   PetscFunctionBegin;
719   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
720   (*C)->ops->matmult = MatMatMult_SeqAIJ_SeqDense;
721   PetscFunctionReturn(0);
722 }
723 
724 #undef __FUNCT__
725 #define __FUNCT__ "MatMatMultNumeric_SeqAIJ_SeqDense"
726 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
727 {
728   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
729   PetscErrorCode ierr;
730   PetscScalar    *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
731   MatScalar      *aa;
732   PetscInt       cm=C->rmap->n, cn=B->cmap->n, bm=B->rmap->n, col, i,j,n,*aj, am = A->rmap->n;
733   PetscInt       am2 = 2*am, am3 = 3*am,  bm4 = 4*bm,colam;
734 
735   PetscFunctionBegin;
736   if (!cm || !cn) PetscFunctionReturn(0);
737   if (bm != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,bm);
738   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
739   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
740   ierr = MatGetArray(B,&b);CHKERRQ(ierr);
741   ierr = MatGetArray(C,&c);CHKERRQ(ierr);
742   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
743   for (col=0; col<cn-4; col += 4){  /* over columns of C */
744     colam = col*am;
745     for (i=0; i<am; i++) {        /* over rows of C in those columns */
746       r1 = r2 = r3 = r4 = 0.0;
747       n   = a->i[i+1] - a->i[i];
748       aj  = a->j + a->i[i];
749       aa  = a->a + a->i[i];
750       for (j=0; j<n; j++) {
751         r1 += (*aa)*b1[*aj];
752         r2 += (*aa)*b2[*aj];
753         r3 += (*aa)*b3[*aj];
754         r4 += (*aa++)*b4[*aj++];
755       }
756       c[colam + i]       = r1;
757       c[colam + am + i]  = r2;
758       c[colam + am2 + i] = r3;
759       c[colam + am3 + i] = r4;
760     }
761     b1 += bm4;
762     b2 += bm4;
763     b3 += bm4;
764     b4 += bm4;
765   }
766   for (;col<cn; col++){     /* over extra columns of C */
767     for (i=0; i<am; i++) {  /* over rows of C in those columns */
768       r1 = 0.0;
769       n   = a->i[i+1] - a->i[i];
770       aj  = a->j + a->i[i];
771       aa  = a->a + a->i[i];
772 
773       for (j=0; j<n; j++) {
774         r1 += (*aa++)*b1[*aj++];
775       }
776       c[col*am + i]     = r1;
777     }
778     b1 += bm;
779   }
780   ierr = PetscLogFlops(cn*(2.0*a->nz));CHKERRQ(ierr);
781   ierr = MatRestoreArray(B,&b);CHKERRQ(ierr);
782   ierr = MatRestoreArray(C,&c);CHKERRQ(ierr);
783   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
784   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
785   PetscFunctionReturn(0);
786 }
787 
788 /*
789    Note very similar to MatMult_SeqAIJ(), should generate both codes from same base
790 */
791 #undef __FUNCT__
792 #define __FUNCT__ "MatMatMultNumericAdd_SeqAIJ_SeqDense"
793 PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
794 {
795   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
796   PetscErrorCode ierr;
797   PetscScalar    *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
798   MatScalar      *aa;
799   PetscInt       cm=C->rmap->n, cn=B->cmap->n, bm=B->rmap->n, col, i,j,n,*aj, am = A->rmap->n,*ii,arm;
800   PetscInt       am2 = 2*am, am3 = 3*am,  bm4 = 4*bm,colam,*ridx;
801 
802   PetscFunctionBegin;
803   if (!cm || !cn) PetscFunctionReturn(0);
804   ierr = MatGetArray(B,&b);CHKERRQ(ierr);
805   ierr = MatGetArray(C,&c);CHKERRQ(ierr);
806   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
807 
808   if (a->compressedrow.use){ /* use compressed row format */
809     for (col=0; col<cn-4; col += 4){  /* over columns of C */
810       colam = col*am;
811       arm   = a->compressedrow.nrows;
812       ii    = a->compressedrow.i;
813       ridx  = a->compressedrow.rindex;
814       for (i=0; i<arm; i++) {        /* over rows of C in those columns */
815 	r1 = r2 = r3 = r4 = 0.0;
816 	n   = ii[i+1] - ii[i];
817 	aj  = a->j + ii[i];
818 	aa  = a->a + ii[i];
819 	for (j=0; j<n; j++) {
820 	  r1 += (*aa)*b1[*aj];
821 	  r2 += (*aa)*b2[*aj];
822 	  r3 += (*aa)*b3[*aj];
823 	  r4 += (*aa++)*b4[*aj++];
824 	}
825 	c[colam       + ridx[i]] += r1;
826 	c[colam + am  + ridx[i]] += r2;
827 	c[colam + am2 + ridx[i]] += r3;
828 	c[colam + am3 + ridx[i]] += r4;
829       }
830       b1 += bm4;
831       b2 += bm4;
832       b3 += bm4;
833       b4 += bm4;
834     }
835     for (;col<cn; col++){     /* over extra columns of C */
836       colam = col*am;
837       arm   = a->compressedrow.nrows;
838       ii    = a->compressedrow.i;
839       ridx  = a->compressedrow.rindex;
840       for (i=0; i<arm; i++) {  /* over rows of C in those columns */
841 	r1 = 0.0;
842 	n   = ii[i+1] - ii[i];
843 	aj  = a->j + ii[i];
844 	aa  = a->a + ii[i];
845 
846 	for (j=0; j<n; j++) {
847 	  r1 += (*aa++)*b1[*aj++];
848 	}
849 	c[col*am + ridx[i]] += r1;
850       }
851       b1 += bm;
852     }
853   } else {
854     for (col=0; col<cn-4; col += 4){  /* over columns of C */
855       colam = col*am;
856       for (i=0; i<am; i++) {        /* over rows of C in those columns */
857 	r1 = r2 = r3 = r4 = 0.0;
858 	n   = a->i[i+1] - a->i[i];
859 	aj  = a->j + a->i[i];
860 	aa  = a->a + a->i[i];
861 	for (j=0; j<n; j++) {
862 	  r1 += (*aa)*b1[*aj];
863 	  r2 += (*aa)*b2[*aj];
864 	  r3 += (*aa)*b3[*aj];
865 	  r4 += (*aa++)*b4[*aj++];
866 	}
867 	c[colam + i]       += r1;
868 	c[colam + am + i]  += r2;
869 	c[colam + am2 + i] += r3;
870 	c[colam + am3 + i] += r4;
871       }
872       b1 += bm4;
873       b2 += bm4;
874       b3 += bm4;
875       b4 += bm4;
876     }
877     for (;col<cn; col++){     /* over extra columns of C */
878       for (i=0; i<am; i++) {  /* over rows of C in those columns */
879 	r1 = 0.0;
880 	n   = a->i[i+1] - a->i[i];
881 	aj  = a->j + a->i[i];
882 	aa  = a->a + a->i[i];
883 
884 	for (j=0; j<n; j++) {
885 	  r1 += (*aa++)*b1[*aj++];
886 	}
887 	c[col*am + i]     += r1;
888       }
889       b1 += bm;
890     }
891   }
892   ierr = PetscLogFlops(cn*2.0*a->nz);CHKERRQ(ierr);
893   ierr = MatRestoreArray(B,&b);CHKERRQ(ierr);
894   ierr = MatRestoreArray(C,&c);CHKERRQ(ierr);
895   PetscFunctionReturn(0);
896 }
897 
898 #undef __FUNCT__
899 #define __FUNCT__ "MatTransColoringApplySpToDen_SeqAIJ"
900 PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
901 {
902   PetscErrorCode ierr;
903   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
904   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
905   PetscInt       *bi=b->i,*bj=b->j;
906   PetscInt       m=Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
907   MatScalar      *btval,*btval_den,*ba=b->a;
908   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
909 
910   PetscFunctionBegin;
911   btval_den=btdense->v;
912   ierr = PetscMemzero(btval_den,(m*n)*sizeof(MatScalar));CHKERRQ(ierr);
913   for (k=0; k<ncolors; k++) {
914     ncolumns = coloring->ncolumns[k];
915     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
916       col   = *(columns + colorforcol[k] + l);
917       btcol = bj + bi[col];
918       btval = ba + bi[col];
919       anz   = bi[col+1] - bi[col];
920       for (j=0; j<anz; j++){
921         brow            = btcol[j];
922         btval_den[brow] = btval[j];
923       }
924     }
925     btval_den += m;
926   }
927   PetscFunctionReturn(0);
928 }
929 
930 #undef __FUNCT__
931 #define __FUNCT__ "MatTransColoringApplyDenToSp_SeqAIJ"
932 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
933 {
934   PetscErrorCode ierr;
935   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)Csp->data;
936   PetscInt       k,l,*row,*idx,m,ncolors=matcoloring->ncolors,nrows;
937   PetscScalar    *ca_den,*cp_den,*ca=csp->a;
938   PetscInt       *rows=matcoloring->rows,*spidx=matcoloring->columnsforspidx,*colorforrow=matcoloring->colorforrow;
939 
940   PetscFunctionBegin;
941   ierr = MatGetLocalSize(Csp,&m,PETSC_NULL);CHKERRQ(ierr);
942   ierr = MatGetArray(Cden,&ca_den);CHKERRQ(ierr);
943   cp_den = ca_den;
944   for (k=0; k<ncolors; k++) {
945     nrows = matcoloring->nrows[k];
946     row   = rows  + colorforrow[k];
947     idx   = spidx + colorforrow[k];
948     for (l=0; l<nrows; l++){
949       ca[idx[l]] = cp_den[row[l]];
950     }
951     cp_den += m;
952   }
953   ierr = MatRestoreArray(Cden,&ca_den);CHKERRQ(ierr);
954   PetscFunctionReturn(0);
955 }
956 
957 /*
958  MatGetColumnIJ_SeqAIJ_Color() and MatRestoreColumnIJ_SeqAIJ_Color() are customized from
959  MatGetColumnIJ_SeqAIJ() and MatRestoreColumnIJ_SeqAIJ() by adding an output
960  spidx[], index of a->j, to be used for setting 'columnsforspidx' in MatTransposeColoringCreate_SeqAIJ().
961  */
962 #undef __FUNCT__
963 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Color"
964 PetscErrorCode MatGetColumnIJ_SeqAIJ_Color(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  inodecompressed,PetscInt *nn,PetscInt *ia[],PetscInt *ja[],PetscInt *spidx[],PetscBool  *done)
965 {
966   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
967   PetscErrorCode ierr;
968   PetscInt       i,*collengths,*cia,*cja,n = A->cmap->n,m = A->rmap->n;
969   PetscInt       nz = a->i[m],row,*jj,mr,col;
970   PetscInt       *cspidx;
971 
972   PetscFunctionBegin;
973   *nn = n;
974   if (!ia) PetscFunctionReturn(0);
975   if (symmetric) {
976     SETERRQ(((PetscObject)A)->comm,PETSC_ERR_SUP,"MatGetColumnIJ_SeqAIJ_Color() not supported for the case symmetric");
977     ierr = MatToSymmetricIJ_SeqAIJ(A->rmap->n,a->i,a->j,0,oshift,ia,ja);CHKERRQ(ierr);
978   } else {
979     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&collengths);CHKERRQ(ierr);
980     ierr = PetscMemzero(collengths,n*sizeof(PetscInt));CHKERRQ(ierr);
981     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&cia);CHKERRQ(ierr);
982     ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&cja);CHKERRQ(ierr);
983     ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&cspidx);CHKERRQ(ierr);
984     jj = a->j;
985     for (i=0; i<nz; i++) {
986       collengths[jj[i]]++;
987     }
988     cia[0] = oshift;
989     for (i=0; i<n; i++) {
990       cia[i+1] = cia[i] + collengths[i];
991     }
992     ierr = PetscMemzero(collengths,n*sizeof(PetscInt));CHKERRQ(ierr);
993     jj   = a->j;
994     for (row=0; row<m; row++) {
995       mr = a->i[row+1] - a->i[row];
996       for (i=0; i<mr; i++) {
997         col = *jj++;
998         cspidx[cia[col] + collengths[col] - oshift] = a->i[row] + i; /* index of a->j */
999         cja[cia[col] + collengths[col]++ - oshift] = row + oshift;
1000       }
1001     }
1002     ierr = PetscFree(collengths);CHKERRQ(ierr);
1003     *ia = cia; *ja = cja;
1004     *spidx = cspidx;
1005   }
1006   PetscFunctionReturn(0);
1007 }
1008 
1009 #undef __FUNCT__
1010 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Color"
1011 PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Color(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  inodecompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscInt *spidx[],PetscBool  *done)
1012 {
1013   PetscErrorCode ierr;
1014 
1015   PetscFunctionBegin;
1016   if (!ia) PetscFunctionReturn(0);
1017 
1018   ierr = PetscFree(*ia);CHKERRQ(ierr);
1019   ierr = PetscFree(*ja);CHKERRQ(ierr);
1020   ierr = PetscFree(*spidx);CHKERRQ(ierr);
1021   PetscFunctionReturn(0);
1022 }
1023 
1024 #undef __FUNCT__
1025 #define __FUNCT__ "MatTransposeColoringCreate_SeqAIJ"
1026 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1027 {
1028   PetscErrorCode ierr;
1029   PetscInt       i,n,nrows,N,j,k,m,*row_idx,*ci,*cj,ncols,col,cm;
1030   const PetscInt *is;
1031   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1032   IS             *isa;
1033   PetscBool      done;
1034   PetscBool      flg1,flg2;
1035   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1036   PetscInt       *colorforrow,*rows,*rows_i,*columnsforspidx,*columnsforspidx_i,*idxhit,*spidx;
1037   PetscInt       *colorforcol,*columns,*columns_i;
1038 
1039   PetscFunctionBegin;
1040   ierr = ISColoringGetIS(iscoloring,PETSC_IGNORE,&isa);CHKERRQ(ierr);
1041 
1042   /* this is ugly way to get blocksize but cannot call MatGetBlockSize() because AIJ can have bs > 1 */
1043   ierr = PetscTypeCompare((PetscObject)mat,MATSEQBAIJ,&flg1);CHKERRQ(ierr);
1044   ierr = PetscTypeCompare((PetscObject)mat,MATMPIBAIJ,&flg2);CHKERRQ(ierr);
1045   if (flg1 || flg2) {
1046     ierr = MatGetBlockSize(mat,&bs);CHKERRQ(ierr);
1047   }
1048 
1049   N         = mat->cmap->N/bs;
1050   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1051   c->N      = mat->cmap->N/bs;
1052   c->m      = mat->rmap->N/bs;
1053   c->rstart = 0;
1054 
1055   c->ncolors = nis;
1056   ierr       = PetscMalloc(nis*sizeof(PetscInt),&c->ncolumns);CHKERRQ(ierr);
1057   ierr       = PetscMalloc(nis*sizeof(PetscInt),&c->nrows);CHKERRQ(ierr);
1058   ierr       = PetscMalloc2(csp->nz+1,PetscInt,&rows,csp->nz+1,PetscInt,&columnsforspidx);CHKERRQ(ierr);
1059   ierr       = PetscMalloc((nis+1)*sizeof(PetscInt),&colorforrow);CHKERRQ(ierr);
1060   colorforrow[0]    = 0;
1061   rows_i            = rows;
1062   columnsforspidx_i = columnsforspidx;
1063 
1064   ierr = PetscMalloc((nis+1)*sizeof(PetscInt),&colorforcol);CHKERRQ(ierr);
1065   ierr = PetscMalloc((N+1)*sizeof(PetscInt),&columns);CHKERRQ(ierr);
1066   colorforcol[0] = 0;
1067   columns_i      = columns;
1068 
1069   ierr = MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,&done);CHKERRQ(ierr); /* column-wise storage of mat */
1070   if (!done) SETERRQ1(((PetscObject)mat)->comm,PETSC_ERR_SUP,"MatGetColumnIJ() not supported for matrix type %s",((PetscObject)mat)->type_name);
1071 
1072   cm = c->m;
1073   ierr = PetscMalloc((cm+1)*sizeof(PetscInt),&rowhit);CHKERRQ(ierr);
1074   ierr = PetscMalloc((cm+1)*sizeof(PetscInt),&idxhit);CHKERRQ(ierr);
1075   for (i=0; i<nis; i++) {
1076     ierr = ISGetLocalSize(isa[i],&n);CHKERRQ(ierr);
1077     ierr = ISGetIndices(isa[i],&is);CHKERRQ(ierr);
1078     c->ncolumns[i] = n;
1079     if (n) {
1080       ierr = PetscMemcpy(columns_i,is,n*sizeof(PetscInt));CHKERRQ(ierr);
1081     }
1082     colorforcol[i+1] = colorforcol[i] + n;
1083     columns_i       += n;
1084 
1085     /* fast, crude version requires O(N*N) work */
1086     ierr = PetscMemzero(rowhit,cm*sizeof(PetscInt));CHKERRQ(ierr);
1087 
1088     /* loop over columns*/
1089     for (j=0; j<n; j++) {
1090       col     = is[j];
1091       row_idx = cj + ci[col];
1092       m       = ci[col+1] - ci[col];
1093       /* loop over columns marking them in rowhit */
1094       for (k=0; k<m; k++) {
1095         idxhit[*row_idx]   = spidx[ci[col] + k];
1096         rowhit[*row_idx++] = col + 1;
1097       }
1098     }
1099     /* count the number of hits */
1100     nrows = 0;
1101     for (j=0; j<cm; j++) {
1102       if (rowhit[j]) nrows++;
1103     }
1104     c->nrows[i]      = nrows;
1105     colorforrow[i+1] = colorforrow[i] + nrows;
1106 
1107     nrows       = 0;
1108     for (j=0; j<cm; j++) {
1109       if (rowhit[j]) {
1110         rows_i[nrows]            = j;
1111         columnsforspidx_i[nrows] = idxhit[j];
1112         nrows++;
1113       }
1114     }
1115     ierr = ISRestoreIndices(isa[i],&is);CHKERRQ(ierr);
1116     rows_i += nrows; columnsforspidx_i += nrows;
1117   }
1118   ierr = MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,&done);CHKERRQ(ierr);
1119   ierr = PetscFree(rowhit);CHKERRQ(ierr);
1120   ierr = ISColoringRestoreIS(iscoloring,&isa);CHKERRQ(ierr);
1121 #if defined(PETSC_USE_DEBUG)
1122   if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1123 #endif
1124 
1125   c->colorforrow     = colorforrow;
1126   c->rows            = rows;
1127   c->columnsforspidx = columnsforspidx;
1128   c->colorforcol     = colorforcol;
1129   c->columns         = columns;
1130   ierr = PetscFree(idxhit);CHKERRQ(ierr);
1131   PetscFunctionReturn(0);
1132 }
1133