xref: /petsc/src/mat/impls/aij/mpi/mpimatmatmult.c (revision 8b83055f287c8deb1a16325ae3faee24d0648b7a)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of MPIAIJ matrices
4           C = A * B
5 */
6 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7 #include <../src/mat/utils/freespace.h>
8 #include <../src/mat/impls/aij/mpi/mpiaij.h>
9 #include <petscbt.h>
10 #include <../src/mat/impls/dense/mpi/mpidense.h>
11 #include <petsc-private/vecimpl.h>
12 
13 #undef __FUNCT__
14 #define __FUNCT__ "MatMatMult_MPIAIJ_MPIAIJ"
15 PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C)
16 {
17   PetscErrorCode ierr;
18 
19   PetscFunctionBegin;
20   if (scall == MAT_INITIAL_MATRIX) {
21     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
22     ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);CHKERRQ(ierr);
23     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
24   }
25   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
26   ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
27   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
28   PetscFunctionReturn(0);
29 }
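/*
   Illustrative usage sketch (not part of this file): callers form C = A*B once
   with MAT_INITIAL_MATRIX, which runs the symbolic phase, and then reuse the
   stored symbolic data with MAT_REUSE_MATRIX when only the numerical values of
   A or B have changed.  Amat, Bmat and the fill estimate 2.0 are assumptions
   for the example.

     Mat Cmat;
     ierr = MatMatMult(Amat,Bmat,MAT_INITIAL_MATRIX,2.0,&Cmat);CHKERRQ(ierr);
     ... update values of Amat/Bmat, keeping their nonzero patterns ...
     ierr = MatMatMult(Amat,Bmat,MAT_REUSE_MATRIX,PETSC_DEFAULT,&Cmat);CHKERRQ(ierr);
     ierr = MatDestroy(&Cmat);CHKERRQ(ierr);
*/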
30 
31 #undef __FUNCT__
32 #define __FUNCT__ "MatDestroy_MPIAIJ_MatMatMult"
33 PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
34 {
35   PetscErrorCode ierr;
36   Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
37   Mat_PtAPMPI    *ptap = a->ptap;
38 
39   PetscFunctionBegin;
40   ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
41   ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
42   ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
43   ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
44   ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);
45   ierr = PetscFree(ptap->api);CHKERRQ(ierr);
46   ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
47   ierr = PetscFree(ptap->apa);CHKERRQ(ierr);
48   ierr = ptap->destroy(A);CHKERRQ(ierr);
49   ierr = PetscFree(ptap);CHKERRQ(ierr);
50   PetscFunctionReturn(0);
51 }
52 
53 #undef __FUNCT__
54 #define __FUNCT__ "MatDuplicate_MPIAIJ_MatMatMult"
55 PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
56 {
57   PetscErrorCode ierr;
58   Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
59   Mat_PtAPMPI    *ptap = a->ptap;
60 
61   PetscFunctionBegin;
62   ierr = (*ptap->duplicate)(A,op,M);CHKERRQ(ierr);
63 
64   (*M)->ops->destroy   = ptap->destroy;   /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
65   (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
66   PetscFunctionReturn(0);
67 }
68 
69 #undef __FUNCT__
70 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ"
71 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
72 {
73   PetscErrorCode ierr;
74   Mat_MPIAIJ     *a  =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
75   Mat_SeqAIJ     *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
76   Mat_SeqAIJ     *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
77   PetscInt       *adi=ad->i,*adj,*aoi=ao->i,*aoj;
78   PetscScalar    *ada,*aoa,*cda=cd->a,*coa=co->a;
79   Mat_SeqAIJ     *p_loc,*p_oth;
80   PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
81   PetscScalar    *pa_loc,*pa_oth,*pa,*apa,valtmp,*ca;
82   PetscInt       cm   =C->rmap->n,anz,pnz;
83   Mat_PtAPMPI    *ptap=c->ptap;
84   PetscInt       *api,*apj,*apJ,i,j,k,row;
85   PetscInt       cstart=C->cmap->rstart;
86   PetscInt       cdnz,conz,k0,k1;
87 
88   PetscFunctionBegin;
89   /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
90   /*-----------------------------------------------------*/
91   /* update numerical values of P_oth and P_loc */
92   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
93   ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
94 
95   /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
96   /*----------------------------------------------------------*/
97   /* get data from symbolic products */
98   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
99   p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
100   pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
101   pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
102 
103   /* get apa for storing dense row A[i,:]*P */
104   apa = ptap->apa;
105 
106   api = ptap->api;
107   apj = ptap->apj;
108   for (i=0; i<cm; i++) {
109     /* diagonal portion of A */
110     anz = adi[i+1] - adi[i];
111     adj = ad->j + adi[i];
112     ada = ad->a + adi[i];
113     for (j=0; j<anz; j++) {
114       row = adj[j];
115       pnz = pi_loc[row+1] - pi_loc[row];
116       pj  = pj_loc + pi_loc[row];
117       pa  = pa_loc + pi_loc[row];
118 
119       /* perform dense axpy */
120       valtmp = ada[j];
121       for (k=0; k<pnz; k++) {
122         apa[pj[k]] += valtmp*pa[k];
123       }
124       ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
125     }
126 
127     /* off-diagonal portion of A */
128     anz = aoi[i+1] - aoi[i];
129     aoj = ao->j + aoi[i];
130     aoa = ao->a + aoi[i];
131     for (j=0; j<anz; j++) {
132       row = aoj[j];
133       pnz = pi_oth[row+1] - pi_oth[row];
134       pj  = pj_oth + pi_oth[row];
135       pa  = pa_oth + pi_oth[row];
136 
137       /* perform dense axpy */
138       valtmp = aoa[j];
139       for (k=0; k<pnz; k++) {
140         apa[pj[k]] += valtmp*pa[k];
141       }
142       ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
143     }
144 
145     /* set values in C */
146     apJ  = apj + api[i];
147     cdnz = cd->i[i+1] - cd->i[i];
148     conz = co->i[i+1] - co->i[i];
149 
150     /* 1st off-diagonal part of C */
151     ca = coa + co->i[i];
152     k  = 0;
153     for (k0=0; k0<conz; k0++) {
154       if (apJ[k] >= cstart) break;
155       ca[k0]      = apa[apJ[k]];
156       apa[apJ[k]] = 0.0;
157       k++;
158     }
159 
160     /* diagonal part of C */
161     ca = cda + cd->i[i];
162     for (k1=0; k1<cdnz; k1++) {
163       ca[k1]      = apa[apJ[k]];
164       apa[apJ[k]] = 0.0;
165       k++;
166     }
167 
168     /* 2nd off-diagonal part of C */
169     ca = coa + co->i[i];
170     for (; k0<conz; k0++) {
171       ca[k0]      = apa[apJ[k]];
172       apa[apJ[k]] = 0.0;
173       k++;
174     }
175   }
176   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
177   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
178   PetscFunctionReturn(0);
179 }
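/*
   Minimal sketch of the dense-accumulator kernel used in the row loop above
   (illustrative only; ai,aj,aa and pi,pj,pa are assumed CSR arrays of a
   sequential A and P, and apa is the dense work array of length pN):

     for (j=ai[i]; j<ai[i+1]; j++) {
       PetscInt    col = aj[j];
       PetscScalar v   = aa[j];
       for (k=pi[col]; k<pi[col+1]; k++) apa[pj[k]] += v*pa[k];
     }

   The three scatter loops in the routine above then gather apa[] into C's
   preallocated off-diagonal (co) and diagonal (cd) blocks in increasing column
   order and reset the touched entries of apa to zero.
*/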
180 
181 #undef __FUNCT__
182 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ"
183 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
184 {
185   PetscErrorCode     ierr;
186   MPI_Comm           comm;
187   Mat                Cmpi;
188   Mat_PtAPMPI        *ptap;
189   PetscFreeSpaceList free_space=NULL,current_space=NULL;
190   Mat_MPIAIJ         *a        =(Mat_MPIAIJ*)A->data,*c;
191   Mat_SeqAIJ         *ad       =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
192   PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
193   PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
194   PetscInt           *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
195   PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
196   PetscBT            lnkbt;
197   PetscScalar        *apa;
198   PetscReal          afill;
199   PetscBool          scalable=PETSC_TRUE;
200   PetscInt           nlnk_max,armax,prmax;
201 
202   PetscFunctionBegin;
203   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
204   if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) {
205     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
206   }
207 
208   ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
209   ierr = PetscOptionsBool("-matmatmult_scalable","Use a scalable but slower C=A*B","",scalable,&scalable,NULL);CHKERRQ(ierr);
210   ierr = PetscOptionsEnd();CHKERRQ(ierr);
211   if (scalable) {
212     ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(A,P,fill,C);CHKERRQ(ierr);
213     PetscFunctionReturn(0);
214   }
215 
216   /* create struct Mat_PtAPMPI and attach it to C later */
217   ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr);
218 
219   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
220   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
221 
222   /* get P_loc by taking all local rows of P */
223   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
224 
225   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
226   p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
227   pi_loc = p_loc->i; pj_loc = p_loc->j;
228   pi_oth = p_oth->i; pj_oth = p_oth->j;
229 
230   /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
231   /*-------------------------------------------------------------------*/
232   ierr      = PetscMalloc((am+2)*sizeof(PetscInt),&api);CHKERRQ(ierr);
233   ptap->api = api;
234   api[0]    = 0;
235 
236   /* create and initialize a linked list */
237   armax    = ad->rmax+ao->rmax;
238   prmax    = PetscMax(p_loc->rmax,p_oth->rmax);
239   nlnk_max = armax*prmax;
240   if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
241   ierr = PetscLLCondensedCreate(nlnk_max,pN,&lnk,&lnkbt);CHKERRQ(ierr);
242 
243   /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
244   ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);
245 
246   current_space = free_space;
247 
248   ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
249   for (i=0; i<am; i++) {
250     /* diagonal portion of A */
251     nzi = adi[i+1] - adi[i];
252     for (j=0; j<nzi; j++) {
253       row  = *adj++;
254       pnz  = pi_loc[row+1] - pi_loc[row];
255       Jptr = pj_loc + pi_loc[row];
256       /* add non-zero cols of P into the sorted linked list lnk */
257       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
258     }
259     /* off-diagonal portion of A */
260     nzi = aoi[i+1] - aoi[i];
261     for (j=0; j<nzi; j++) {
262       row  = *aoj++;
263       pnz  = pi_oth[row+1] - pi_oth[row];
264       Jptr = pj_oth + pi_oth[row];
265       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
266     }
267 
268     apnz     = lnk[0];
269     api[i+1] = api[i] + apnz;
270 
271     /* if free space is not available, double the total space in the list */
272     if (current_space->local_remaining<apnz) {
273       ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
274       nspacedouble++;
275     }
276 
277     /* Copy data into free space, then initialize lnk */
278     ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
279     ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);
280 
281     current_space->array           += apnz;
282     current_space->local_used      += apnz;
283     current_space->local_remaining -= apnz;
284   }
285 
286   /* Allocate space for apj, initialize apj, and */
287   /* destroy list of free space and other temporary array(s) */
288   ierr = PetscMalloc((api[am]+1)*sizeof(PetscInt),&ptap->apj);CHKERRQ(ierr);
289   apj  = ptap->apj;
290   ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
291   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
292 
293   /* malloc apa to store dense row A[i,:]*P */
294   ierr = PetscMalloc(pN*sizeof(PetscScalar),&apa);CHKERRQ(ierr);
295   ierr = PetscMemzero(apa,pN*sizeof(PetscScalar));CHKERRQ(ierr);
296 
297   ptap->apa = apa;
298 
299   /* create and assemble symbolic parallel matrix Cmpi */
300   /*----------------------------------------------------*/
301   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
302   ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
303   ierr = MatSetBlockSizes(Cmpi,A->rmap->bs,P->cmap->bs);CHKERRQ(ierr);
304 
305   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
306   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
307   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
308   for (i=0; i<am; i++) {
309     row  = i + rstart;
310     apnz = api[i+1] - api[i];
311     ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
312     apj += apnz;
313   }
314   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
315   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
316 
317   ptap->destroy        = Cmpi->ops->destroy;
318   ptap->duplicate      = Cmpi->ops->duplicate;
319   Cmpi->ops->destroy   = MatDestroy_MPIAIJ_MatMatMult;
320   Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
321 
322   /* attach the supporting struct to Cmpi for reuse */
323   c       = (Mat_MPIAIJ*)Cmpi->data;
324   c->ptap = ptap;
325 
326   *C = Cmpi;
327 
328   /* set MatInfo */
329   afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
330   if (afill < 1.0) afill = 1.0;
331   Cmpi->info.mallocs           = nspacedouble;
332   Cmpi->info.fill_ratio_given  = fill;
333   Cmpi->info.fill_ratio_needed = afill;
334 
335 #if defined(PETSC_USE_INFO)
336   if (api[am]) {
337     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
338     ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);CHKERRQ(ierr);
339   } else {
340     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
341   }
342 #endif
343   PetscFunctionReturn(0);
344 }
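/*
   Note on the fill estimate reported above (illustrative): the initial
   FreeSpace holds fill*(nnz(A_diag)+nnz(A_off)+nnz(P_loc)) entries and is
   doubled (nspacedouble++) whenever a row of AP overflows it; the logged
   fill_ratio_needed is roughly nnz(AP)/(nnz(A)+nnz(P_loc)).  For example, with
   nnz(A)=nnz(P_loc)=1000 and nnz(AP)=3000 the needed ratio is about 1.5, so
   passing fill=1.5 to a later MatMatMult() with the same sparsity patterns
   avoids the reallocations.
*/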
345 
346 #undef __FUNCT__
347 #define __FUNCT__ "MatMatMult_MPIAIJ_MPIDense"
348 PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
349 {
350   PetscErrorCode ierr;
351 
352   PetscFunctionBegin;
353   if (scall == MAT_INITIAL_MATRIX) {
354     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
355     ierr = MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);CHKERRQ(ierr);
356     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
357   }
358   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
359   ierr = MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);CHKERRQ(ierr);
360   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
361   PetscFunctionReturn(0);
362 }
363 
364 typedef struct {
365   Mat         workB;
366   PetscScalar *rvalues,*svalues;
367   MPI_Request *rwaits,*swaits;
368 } MPIAIJ_MPIDense;
369 
370 #undef __FUNCT__
371 #define __FUNCT__ "MatMPIAIJ_MPIDenseDestroy"
372 PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
373 {
374   MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
375   PetscErrorCode  ierr;
376 
377   PetscFunctionBegin;
378   ierr = MatDestroy(&contents->workB);CHKERRQ(ierr);
379   ierr = PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);CHKERRQ(ierr);
380   ierr = PetscFree(contents);CHKERRQ(ierr);
381   PetscFunctionReturn(0);
382 }
383 
384 #undef __FUNCT__
385 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIDense"
386 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
387 {
388   PetscErrorCode         ierr;
389   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
390   PetscInt               nz   = aij->B->cmap->n;
391   PetscContainer         container;
392   MPIAIJ_MPIDense        *contents;
393   VecScatter             ctx   = aij->Mvctx;
394   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
395   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
396   PetscInt               m     = A->rmap->n,n=B->cmap->n;
397 
398   PetscFunctionBegin;
399   ierr = MatCreate(PetscObjectComm((PetscObject)B),C);CHKERRQ(ierr);
400   ierr = MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);CHKERRQ(ierr);
401   ierr = MatSetBlockSizes(*C,A->rmap->bs,B->cmap->bs);CHKERRQ(ierr);
402   ierr = MatSetType(*C,MATMPIDENSE);CHKERRQ(ierr);
403   ierr = MatMPIDenseSetPreallocation(*C,NULL);CHKERRQ(ierr);
404   ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
405   ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
406 
407   (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
408 
409   ierr = PetscNew(MPIAIJ_MPIDense,&contents);CHKERRQ(ierr);
410   /* Create work matrix used to store the off-processor rows of B needed for the local product */
411   ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
412   /* Create work arrays needed */
413   ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],PetscScalar,&contents->rvalues,
414                       B->cmap->N*to->starts[to->n],PetscScalar,&contents->svalues,
415                       from->n,MPI_Request,&contents->rwaits,
416                       to->n,MPI_Request,&contents->swaits);CHKERRQ(ierr);
417 
418   ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr);
419   ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
420   ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
421   ierr = PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);CHKERRQ(ierr);
422   ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
423   PetscFunctionReturn(0);
424 }
425 
426 #undef __FUNCT__
427 #define __FUNCT__ "MatMPIDenseScatter"
428 /*
429     Performs an efficient scatter on the rows of B needed by this process; this is
430     a modification of the VecScatterBegin_() routines.
431 */
432 PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
433 {
434   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*)A->data;
435   PetscErrorCode         ierr;
436   PetscScalar            *b,*w,*svalues,*rvalues;
437   VecScatter             ctx   = aij->Mvctx;
438   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
439   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
440   PetscInt               i,j,k;
441   PetscInt               *sindices,*sstarts,*rindices,*rstarts;
442   PetscMPIInt            *sprocs,*rprocs,nrecvs;
443   MPI_Request            *swaits,*rwaits;
444   MPI_Comm               comm;
445   PetscMPIInt            tag  = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
446   MPI_Status             status;
447   MPIAIJ_MPIDense        *contents;
448   PetscContainer         container;
449   Mat                    workB;
450 
451   PetscFunctionBegin;
452   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
453   ierr = PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);CHKERRQ(ierr);
454   if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
455   ierr = PetscContainerGetPointer(container,(void**)&contents);CHKERRQ(ierr);
456 
457   workB = *outworkB = contents->workB;
458   if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",workB->rmap->n,nrows);
459   sindices = to->indices;
460   sstarts  = to->starts;
461   sprocs   = to->procs;
462   swaits   = contents->swaits;
463   svalues  = contents->svalues;
464 
465   rindices = from->indices;
466   rstarts  = from->starts;
467   rprocs   = from->procs;
468   rwaits   = contents->rwaits;
469   rvalues  = contents->rvalues;
470 
471   ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr);
472   ierr = MatDenseGetArray(workB,&w);CHKERRQ(ierr);
473 
474   for (i=0; i<from->n; i++) {
475     ierr = MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);CHKERRQ(ierr);
476   }
477 
478   for (i=0; i<to->n; i++) {
479     /* pack a message at a time */
480     for (j=0; j<sstarts[i+1]-sstarts[i]; j++) {
481       for (k=0; k<ncols; k++) {
482         svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
483       }
484     }
485     ierr = MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);CHKERRQ(ierr);
486   }
487 
488   nrecvs = from->n;
489   while (nrecvs) {
490     ierr = MPI_Waitany(from->n,rwaits,&imdex,&status);CHKERRQ(ierr);
491     nrecvs--;
492     /* unpack a message at a time */
493     for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++) {
494       for (k=0; k<ncols; k++) {
495         w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
496       }
497     }
498   }
499   if (to->n) {ierr = MPI_Waitall(to->n,swaits,to->sstatus);CHKERRQ(ierr);}
500 
501   ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr);
502   ierr = MatDenseRestoreArray(workB,&w);CHKERRQ(ierr);
503   ierr = MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
504   ierr = MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
505   PetscFunctionReturn(0);
506 }
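/*
   Layout note with a small sketch (illustrative): the dense arrays obtained
   from MatDenseGetArray() are stored column-major, so entry (r,k) of the local
   part of B with nrowsB local rows lives at b[r + nrowsB*k].  Packing the row
   with local index r into position pos of the send buffer is therefore

     for (k=0; k<ncols; k++) svalues[ncols*pos + k] = b[r + nrowsB*k];

   and the receiver stores a row back into workB (nrows local rows) as
   w[r_local + nrows*k], which is exactly what the pack/unpack loops above do.
*/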
507 extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);
508 
509 #undef __FUNCT__
510 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIDense"
511 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
512 {
513   PetscErrorCode ierr;
514   Mat_MPIAIJ     *aij    = (Mat_MPIAIJ*)A->data;
515   Mat_MPIDense   *bdense = (Mat_MPIDense*)B->data;
516   Mat_MPIDense   *cdense = (Mat_MPIDense*)C->data;
517   Mat            workB;
518 
519   PetscFunctionBegin;
520   /* diagonal block of A times all local rows of B */
521   ierr = MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);CHKERRQ(ierr);
522 
523   /* get off processor parts of B needed to complete the product */
524   ierr = MatMPIDenseScatter(A,B,C,&workB);CHKERRQ(ierr);
525 
526   /* off-diagonal block of A times nonlocal rows of B */
527   ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);CHKERRQ(ierr);
528   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
529   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
530   PetscFunctionReturn(0);
531 }
532 
533 #undef __FUNCT__
534 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable"
535 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable(Mat A,Mat P,Mat C)
536 {
537   PetscErrorCode ierr;
538   Mat_MPIAIJ     *a   = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
539   Mat_SeqAIJ     *ad  = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
540   Mat_SeqAIJ     *cd  = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
541   PetscInt       *adi = ad->i,*adj,*aoi=ao->i,*aoj;
542   PetscScalar    *ada,*aoa,*cda=cd->a,*coa=co->a;
543   Mat_SeqAIJ     *p_loc,*p_oth;
544   PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
545   PetscScalar    *pa_loc,*pa_oth,*pa,valtmp,*ca;
546   PetscInt       cm          = C->rmap->n,anz,pnz;
547   Mat_PtAPMPI    *ptap       = c->ptap;
548   PetscScalar    *apa_sparse = ptap->apa;
549   PetscInt       *api,*apj,*apJ,i,j,k,row;
550   PetscInt       cstart = C->cmap->rstart;
551   PetscInt       cdnz,conz,k0,k1,nextp;
552 
553   PetscFunctionBegin;
554   /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
555   /*-----------------------------------------------------*/
556   /* update numerical values of P_oth and P_loc */
557   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
558   ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
559 
560   /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
561   /*----------------------------------------------------------*/
562   /* get data from symbolic products */
563   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
564   p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
565   pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
566   pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
567 
568   api = ptap->api;
569   apj = ptap->apj;
570   for (i=0; i<cm; i++) {
571     apJ = apj + api[i];
572 
573     /* diagonal portion of A */
574     anz = adi[i+1] - adi[i];
575     adj = ad->j + adi[i];
576     ada = ad->a + adi[i];
577     for (j=0; j<anz; j++) {
578       row = adj[j];
579       pnz = pi_loc[row+1] - pi_loc[row];
580       pj  = pj_loc + pi_loc[row];
581       pa  = pa_loc + pi_loc[row];
582       /* perform sparse axpy */
583       valtmp = ada[j];
584       nextp  = 0;
585       for (k=0; nextp<pnz; k++) {
586         if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
587           apa_sparse[k] += valtmp*pa[nextp++];
588         }
589       }
590       ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
591     }
592 
593     /* off-diagonal portion of A */
594     anz = aoi[i+1] - aoi[i];
595     aoj = ao->j + aoi[i];
596     aoa = ao->a + aoi[i];
597     for (j=0; j<anz; j++) {
598       row = aoj[j];
599       pnz = pi_oth[row+1] - pi_oth[row];
600       pj  = pj_oth + pi_oth[row];
601       pa  = pa_oth + pi_oth[row];
602       /* perform sparse axpy */
603       valtmp = aoa[j];
604       nextp  = 0;
605       for (k=0; nextp<pnz; k++) {
606         if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
607           apa_sparse[k] += valtmp*pa[nextp++];
608         }
609       }
610       ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
611     }
612 
613     /* set values in C */
614     cdnz = cd->i[i+1] - cd->i[i];
615     conz = co->i[i+1] - co->i[i];
616 
617     /* 1st off-diagonal part of C */
618     ca = coa + co->i[i];
619     k  = 0;
620     for (k0=0; k0<conz; k0++) {
621       if (apJ[k] >= cstart) break;
622       ca[k0]        = apa_sparse[k];
623       apa_sparse[k] = 0.0;
624       k++;
625     }
626 
627     /* diagonal part of C */
628     ca = cda + cd->i[i];
629     for (k1=0; k1<cdnz; k1++) {
630       ca[k1]        = apa_sparse[k];
631       apa_sparse[k] = 0.0;
632       k++;
633     }
634 
635     /* 2nd off-diagonal part of C */
636     ca = coa + co->i[i];
637     for (; k0<conz; k0++) {
638       ca[k0]        = apa_sparse[k];
639       apa_sparse[k] = 0.0;
640       k++;
641     }
642   }
643   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
644   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
645   PetscFunctionReturn(0);
646 }
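/*
   Sketch of the sparse axpy used in the scalable numeric routine above
   (illustrative): apJ holds the sorted column indices of row i of AP computed
   in the symbolic phase, and pj holds the sorted columns of the current row of
   P, a subset of apJ; a single forward scan therefore matches them without a
   dense length-pN work array:

     nextp = 0;
     for (k=0; nextp<pnz; k++) {
       if (apJ[k] == pj[nextp]) apa_sparse[k] += valtmp*pa[nextp++];
     }
*/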
647 
648 /* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ(), except using LLCondensed to avoid O(BN) memory requirement */
649 #undef __FUNCT__
650 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable"
651 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(Mat A,Mat P,PetscReal fill,Mat *C)
652 {
653   PetscErrorCode     ierr;
654   MPI_Comm           comm;
655   Mat                Cmpi;
656   Mat_PtAPMPI        *ptap;
657   PetscFreeSpaceList free_space = NULL,current_space=NULL;
658   Mat_MPIAIJ         *a         = (Mat_MPIAIJ*)A->data,*c;
659   Mat_SeqAIJ         *ad        = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
660   PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
661   PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
662   PetscInt           i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max=0;
663   PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
664   PetscInt           nlnk_max,armax,prmax;
665   PetscReal          afill;
666   PetscScalar        *apa;
667 
668   PetscFunctionBegin;
669   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
670   /* create struct Mat_PtAPMPI and attach it to C later */
671   ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr);
672 
673   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
674   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
675 
676   /* get P_loc by taking all local rows of P */
677   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
678 
679   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
680   p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
681   pi_loc = p_loc->i; pj_loc = p_loc->j;
682   pi_oth = p_oth->i; pj_oth = p_oth->j;
683 
684   /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
685   /*-------------------------------------------------------------------*/
686   ierr      = PetscMalloc((am+2)*sizeof(PetscInt),&api);CHKERRQ(ierr);
687   ptap->api = api;
688   api[0]    = 0;
689 
690   /* create and initialize a linked list */
691   armax    = ad->rmax+ao->rmax;
692   prmax    = PetscMax(p_loc->rmax,p_oth->rmax);
693   nlnk_max = armax*prmax;
694   if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
695   ierr = PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);CHKERRQ(ierr);
696 
697   /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
698   ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);
699 
700   current_space = free_space;
701 
702   ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
703   for (i=0; i<am; i++) {
704     /* diagonal portion of A */
705     nzi = adi[i+1] - adi[i];
706     for (j=0; j<nzi; j++) {
707       row  = *adj++;
708       pnz  = pi_loc[row+1] - pi_loc[row];
709       Jptr = pj_loc + pi_loc[row];
710       /* add non-zero cols of P into the sorted linked list lnk */
711       ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
712     }
713     /* off-diagonal portion of A */
714     nzi = aoi[i+1] - aoi[i];
715     for (j=0; j<nzi; j++) {
716       row  = *aoj++;
717       pnz  = pi_oth[row+1] - pi_oth[row];
718       Jptr = pj_oth + pi_oth[row];
719       ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
720     }
721 
722     apnz     = *lnk;
723     api[i+1] = api[i] + apnz;
724     if (apnz > apnz_max) apnz_max = apnz;
725 
726     /* if free space is not available, double the total space in the list */
727     if (current_space->local_remaining<apnz) {
728       ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
729       nspacedouble++;
730     }
731 
732     /* Copy data into free space, then initialize lnk */
733     ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);
734     ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);
735 
736     current_space->array           += apnz;
737     current_space->local_used      += apnz;
738     current_space->local_remaining -= apnz;
739   }
740 
741   /* Allocate space for apj, initialize apj, and */
742   /* destroy list of free space and other temporary array(s) */
743   ierr = PetscMalloc((api[am]+1)*sizeof(PetscInt),&ptap->apj);CHKERRQ(ierr);
744   apj  = ptap->apj;
745   ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
746   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
747 
748   /* create and assemble symbolic parallel matrix Cmpi */
749   /*----------------------------------------------------*/
750   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
751   ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
752   ierr = MatSetBlockSizes(Cmpi,A->rmap->bs,P->cmap->bs);CHKERRQ(ierr);
753   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
754   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
755   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
756 
757   /* malloc apa for assembling Cmpi */
758   ierr = PetscMalloc(apnz_max*sizeof(PetscScalar),&apa);CHKERRQ(ierr);
759   ierr = PetscMemzero(apa,apnz_max*sizeof(PetscScalar));CHKERRQ(ierr);
760 
761   ptap->apa = apa;
762   for (i=0; i<am; i++) {
763     row  = i + rstart;
764     apnz = api[i+1] - api[i];
765     ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
766     apj += apnz;
767   }
768   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
769   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
770 
771   ptap->destroy             = Cmpi->ops->destroy;
772   ptap->duplicate           = Cmpi->ops->duplicate;
773   Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable;
774   Cmpi->ops->destroy        = MatDestroy_MPIAIJ_MatMatMult;
775   Cmpi->ops->duplicate      = MatDuplicate_MPIAIJ_MatMatMult;
776 
777   /* attach the supporting struct to Cmpi for reuse */
778   c       = (Mat_MPIAIJ*)Cmpi->data;
779   c->ptap = ptap;
780 
781   *C = Cmpi;
782 
783   /* set MatInfo */
784   afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
785   if (afill < 1.0) afill = 1.0;
786   Cmpi->info.mallocs           = nspacedouble;
787   Cmpi->info.fill_ratio_given  = fill;
788   Cmpi->info.fill_ratio_needed = afill;
789 
790 #if defined(PETSC_USE_INFO)
791   if (api[am]) {
792     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
793     ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);CHKERRQ(ierr);
794   } else {
795     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
796   }
797 #endif
798   PetscFunctionReturn(0);
799 }
800 
801 /*-------------------------------------------------------------------------*/
802 #undef __FUNCT__
803 #define __FUNCT__ "MatTransposeMatMult_MPIAIJ_MPIAIJ"
804 PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
805 {
806   PetscErrorCode ierr;
807   PetscBool      scalable=PETSC_TRUE,viamatmatmult=PETSC_FALSE;
808 
809   PetscFunctionBegin;
810   ierr = PetscOptionsBool("-mattransposematmult_viamatmatmult","Use R=Pt and C=R*A","",viamatmatmult,&viamatmatmult,NULL);CHKERRQ(ierr);
811   if (viamatmatmult) {
812     Mat         Pt;
813     Mat_PtAPMPI *ptap;
814     Mat_MPIAIJ  *c;
815     if (scall == MAT_INITIAL_MATRIX) {
816       ierr = MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);CHKERRQ(ierr);
817       ierr = MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr);
818 
819       c        = (Mat_MPIAIJ*)(*C)->data;
820       ptap     = c->ptap;
821       ptap->Pt = Pt;
822     } else if (scall == MAT_REUSE_MATRIX) {
823       c    = (Mat_MPIAIJ*)(*C)->data;
824       ptap = c->ptap;
825       Pt   = ptap->Pt;
826       ierr = MatTranspose(P,scall,&Pt);CHKERRQ(ierr);
827       ierr = MatMatMult(Pt,A,scall,fill,C);CHKERRQ(ierr);
828     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not supported");
829     PetscFunctionReturn(0);
830   }
831 
832   if (scall == MAT_INITIAL_MATRIX) {
833     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
834     ierr = PetscOptionsBool("-mattransposematmult_scalable","Use a scalable but slower C=Pt*A","",scalable,&scalable,NULL);CHKERRQ(ierr);
835     if  (scalable) {
836       ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(P,A,fill,C);CHKERRQ(ierr);
837     } else {
838       ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);CHKERRQ(ierr);
839     }
840     ierr = PetscOptionsEnd();CHKERRQ(ierr);
841   }
842   ierr = (*(*C)->ops->mattransposemultnumeric)(P,A,*C);CHKERRQ(ierr);
843   PetscFunctionReturn(0);
844 }
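/*
   Illustrative usage sketch (not part of this file): C = P^T*A is normally
   formed directly; with -mattransposematmult_viamatmatmult it is instead
   computed as an explicit transpose Pt = P^T followed by C = Pt*A, with Pt
   cached in ptap->Pt for reuse.  Pmat and Amat are assumptions for the
   example.

     Mat Cmat;
     ierr = MatTransposeMatMult(Pmat,Amat,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&Cmat);CHKERRQ(ierr);
     ierr = MatTransposeMatMult(Pmat,Amat,MAT_REUSE_MATRIX,PETSC_DEFAULT,&Cmat);CHKERRQ(ierr);
     ierr = MatDestroy(&Cmat);CHKERRQ(ierr);
*/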
845 
846 #undef __FUNCT__
847 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ"
848 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
849 {
850   PetscErrorCode      ierr;
851   Mat_Merge_SeqsToMPI *merge;
852   Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
853   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
854   Mat_PtAPMPI         *ptap;
855   PetscInt            *adj,*aJ;
856   PetscInt            i,j,k,anz,pnz,row,*cj;
857   MatScalar           *ada,*aval,*ca,valtmp;
858   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
859   MPI_Comm            comm;
860   PetscMPIInt         size,rank,taga,*len_s;
861   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
862   PetscInt            **buf_ri,**buf_rj;
863   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
864   MPI_Request         *s_waits,*r_waits;
865   MPI_Status          *status;
866   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
867   PetscInt            *ai,*aj,*coi,*coj;
868   PetscInt            *poJ,*pdJ;
869   Mat                 A_loc;
870   Mat_SeqAIJ          *a_loc;
871 
872   PetscFunctionBegin;
873   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
874   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
875   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
876 
877   ptap  = c->ptap;
878   merge = ptap->merge;
879 
880   /* 2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */
881   /*--------------------------------------------------------------*/
882   /* get data from symbolic products */
883   coi  = merge->coi; coj = merge->coj;
884   ierr = PetscMalloc((coi[pon]+1)*sizeof(MatScalar),&coa);CHKERRQ(ierr);
885   ierr = PetscMemzero(coa,coi[pon]*sizeof(MatScalar));CHKERRQ(ierr);
886 
887   bi     = merge->bi; bj = merge->bj;
888   owners = merge->rowmap->range;
889   ierr   = PetscMalloc((bi[cm]+1)*sizeof(MatScalar),&ba);CHKERRQ(ierr);
890   ierr   = PetscMemzero(ba,bi[cm]*sizeof(MatScalar));CHKERRQ(ierr);
891 
892   /* get A_loc by taking all local rows of A */
893   A_loc = ptap->A_loc;
894   ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
895   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
896   ai    = a_loc->i;
897   aj    = a_loc->j;
898 
899   ierr = PetscMalloc((A->cmap->N)*sizeof(PetscScalar),&aval);CHKERRQ(ierr); /* non-scalable!!! */
900   ierr = PetscMemzero(aval,A->cmap->N*sizeof(PetscScalar));CHKERRQ(ierr);
901 
902   for (i=0; i<am; i++) {
903     /* 2-a) put A[i,:] to dense array aval */
904     anz = ai[i+1] - ai[i];
905     adj = aj + ai[i];
906     ada = a_loc->a + ai[i];
907     for (j=0; j<anz; j++) {
908       aval[adj[j]] = ada[j];
909     }
910 
911     /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
912     /*--------------------------------------------------------------*/
913     /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
914     pnz = po->i[i+1] - po->i[i];
915     poJ = po->j + po->i[i];
916     pA  = po->a + po->i[i];
917     for (j=0; j<pnz; j++) {
918       row = poJ[j];
919       cnz = coi[row+1] - coi[row];
920       cj  = coj + coi[row];
921       ca  = coa + coi[row];
922       /* perform dense axpy */
923       valtmp = pA[j];
924       for (k=0; k<cnz; k++) {
925         ca[k] += valtmp*aval[cj[k]];
926       }
927       ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
928     }
929 
930     /* put the value into Cd (diagonal part) */
931     pnz = pd->i[i+1] - pd->i[i];
932     pdJ = pd->j + pd->i[i];
933     pA  = pd->a + pd->i[i];
934     for (j=0; j<pnz; j++) {
935       row = pdJ[j];
936       cnz = bi[row+1] - bi[row];
937       cj  = bj + bi[row];
938       ca  = ba + bi[row];
939       /* perform dense axpy */
940       valtmp = pA[j];
941       for (k=0; k<cnz; k++) {
942         ca[k] += valtmp*aval[cj[k]];
943       }
944       ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
945     }
946 
947     /* zero the current row of Pt*A */
948     aJ = aj + ai[i];
949     for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
950   }
951 
952   /* 3) send and recv matrix values coa */
953   /*------------------------------------*/
954   buf_ri = merge->buf_ri;
955   buf_rj = merge->buf_rj;
956   len_s  = merge->len_s;
957   ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
958   ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
959 
960   ierr = PetscMalloc2(merge->nsend+1,MPI_Request,&s_waits,size,MPI_Status,&status);CHKERRQ(ierr);
961   for (proc=0,k=0; proc<size; proc++) {
962     if (!len_s[proc]) continue;
963     i    = merge->owners_co[proc];
964     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
965     k++;
966   }
967   if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
968   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
969 
970   ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
971   ierr = PetscFree(r_waits);CHKERRQ(ierr);
972   ierr = PetscFree(coa);CHKERRQ(ierr);
973 
974   /* 4) insert local Cseq and received values into Cmpi */
975   /*----------------------------------------------------*/
976   ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr);
977   for (k=0; k<merge->nrecv; k++) {
978     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
979     nrows       = *(buf_ri_k[k]);
980     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
981     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of the k-th received i-structure */
982   }
983 
984   for (i=0; i<cm; i++) {
985     row  = owners[rank] + i; /* global row index of C_seq */
986     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
987     ba_i = ba + bi[i];
988     bnz  = bi[i+1] - bi[i];
989     /* add received vals into ba */
990     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
991       /* i-th row */
992       if (i == *nextrow[k]) {
993         cnz    = *(nextci[k]+1) - *nextci[k];
994         cj     = buf_rj[k] + *(nextci[k]);
995         ca     = abuf_r[k] + *(nextci[k]);
996         nextcj = 0;
997         for (j=0; nextcj<cnz; j++) {
998           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
999             ba_i[j] += ca[nextcj++];
1000           }
1001         }
1002         nextrow[k]++; nextci[k]++;
1003         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1004       }
1005     }
1006     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
1007   }
1008   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1009   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1010 
1011   ierr = PetscFree(ba);CHKERRQ(ierr);
1012   ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
1013   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1014   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1015   ierr = PetscFree(aval);CHKERRQ(ierr);
1016   PetscFunctionReturn(0);
1017 }
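/*
   The numeric routine above forms C = P_loc^T * A_loc as a sum of outer
   products over the local rows of A: for each local row i and each nonzero
   P(i,p), the stored row p of C accumulates

     C(p,:) += P(i,p) * A(i,:)

   where rows p owned by other processes are accumulated into coa and later
   sent to their owners, while locally owned rows are accumulated directly
   into ba.
*/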
1018 
1019 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1020 #undef __FUNCT__
1021 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ"
1022 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
1023 {
1024   PetscErrorCode      ierr;
1025   Mat                 Cmpi,A_loc,POt,PDt;
1026   Mat_PtAPMPI         *ptap;
1027   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1028   Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
1029   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1030   PetscInt            nnz;
1031   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1032   PetscInt            am=A->rmap->n,pn=P->cmap->n;
1033   PetscBT             lnkbt;
1034   MPI_Comm            comm;
1035   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1036   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1037   PetscInt            len,proc,*dnz,*onz,*owners;
1038   PetscInt            nzi,*bi,*bj;
1039   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1040   MPI_Request         *swaits,*rwaits;
1041   MPI_Status          *sstatus,rstatus;
1042   Mat_Merge_SeqsToMPI *merge;
1043   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1044   PetscReal           afill  =1.0,afill_tmp;
1045   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
1046   PetscScalar         *vals;
1047   Mat_SeqAIJ          *a_loc, *pdt,*pot;
1048 
1049   PetscFunctionBegin;
1050   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1051   /* check if matrix local sizes are compatible */
1052   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
1053     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1054   }
1055 
1056   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1057   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1058 
1059   /* create struct Mat_PtAPMPI and attach it to C later */
1060   ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr);
1061 
1062   /* get A_loc by taking all local rows of A */
1063   ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
1064 
1065   ptap->A_loc = A_loc;
1066 
1067   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1068   ai    = a_loc->i;
1069   aj    = a_loc->j;
1070 
1071   /* determine symbolic Co=(p->B)^T*A - send to others */
1072   /*----------------------------------------------------*/
1073   ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
1074   pdt  = (Mat_SeqAIJ*)PDt->data;
1075   pdti = pdt->i; pdtj = pdt->j;
1076 
1077   ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
1078   pot  = (Mat_SeqAIJ*)POt->data;
1079   poti = pot->i; potj = pot->j;
1080 
1081   /* then, compute symbolic Co = (p->B)^T*A */
1082   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */
1083   ierr   = PetscMalloc((pon+1)*sizeof(PetscInt),&coi);CHKERRQ(ierr);
1084   coi[0] = 0;
1085 
1086   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1087   nnz           = fill*(poti[pon] + ai[am]);
1088   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1089   current_space = free_space;
1090 
1091   /* create and initialize a linked list */
1092   i     = PetscMax(pdt->rmax,pot->rmax);
1093   Crmax = i*a_loc->rmax*size;
1094   if (!Crmax || Crmax > aN) Crmax = aN;
1095   ierr = PetscLLCondensedCreate(Crmax,aN,&lnk,&lnkbt);CHKERRQ(ierr);
1096 
1097   for (i=0; i<pon; i++) {
1098     pnz = poti[i+1] - poti[i];
1099     ptJ = potj + poti[i];
1100     for (j=0; j<pnz; j++) {
1101       row  = ptJ[j]; /* row of A_loc == col of Pot */
1102       anz  = ai[row+1] - ai[row];
1103       Jptr = aj + ai[row];
1104       /* add non-zero cols of AP into the sorted linked list lnk */
1105       ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1106     }
1107     nnz = lnk[0];
1108 
1109     /* If free space is not available, double the total space in the list */
1110     if (current_space->local_remaining<nnz) {
1111       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1112       nspacedouble++;
1113     }
1114 
1115     /* Copy data into free space, and zero out dense rows */
1116     ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1117 
1118     current_space->array           += nnz;
1119     current_space->local_used      += nnz;
1120     current_space->local_remaining -= nnz;
1121 
1122     coi[i+1] = coi[i] + nnz;
1123   }
1124 
1125   ierr = PetscMalloc((coi[pon]+1)*sizeof(PetscInt),&coj);CHKERRQ(ierr);
1126   ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
1127 
1128   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1129   if (afill_tmp > afill) afill = afill_tmp;
1130 
1131   /* send j-array (coj) of Co to other processors */
1132   /*----------------------------------------------*/
1133   /* determine row ownership */
1134   ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
1135   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1136 
1137   merge->rowmap->n  = pn;
1138   merge->rowmap->bs = 1;
1139 
1140   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1141   owners = merge->rowmap->range;
1142 
1143   /* determine the number of messages to send, their lengths */
1144   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
1145   ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1146   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
1147 
1148   len_s        = merge->len_s;
1149   merge->nsend = 0;
1150 
1151   ierr = PetscMalloc((size+2)*sizeof(PetscInt),&owners_co);CHKERRQ(ierr);
1152   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1153 
1154   proc = 0;
1155   for (i=0; i<pon; i++) {
1156     while (prmap[i] >= owners[proc+1]) proc++;
1157     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1158     len_s[proc] += coi[i+1] - coi[i];
1159   }
1160 
1161   len          = 0; /* max length of buf_si[] */
1162   owners_co[0] = 0;
1163   for (proc=0; proc<size; proc++) {
1164     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1165     if (len_si[proc]) {
1166       merge->nsend++;
1167       len_si[proc] = 2*(len_si[proc] + 1);
1168       len         += len_si[proc];
1169     }
1170   }
1171 
1172   /* determine the number and length of messages to receive for coi and coj  */
1173   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1174   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1175 
1176   /* post the Irecv and Isend of coj */
1177   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1178   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1179   ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&swaits);CHKERRQ(ierr);
1180   for (proc=0, k=0; proc<size; proc++) {
1181     if (!len_s[proc]) continue;
1182     i    = owners_co[proc];
1183     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1184     k++;
1185   }
1186 
1187   /* receives and sends of coj are complete */
1188   ierr = PetscMalloc(size*sizeof(MPI_Status),&sstatus);CHKERRQ(ierr);
1189   for (i=0; i<merge->nrecv; i++) {
1190     PetscMPIInt icompleted;
1191     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1192   }
1193   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1194   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1195 
1196   /* send and recv coi */
1197   /*-------------------*/
1198   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1199   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1200   ierr   = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
1201   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1202   for (proc=0,k=0; proc<size; proc++) {
1203     if (!len_s[proc]) continue;
1204     /* form outgoing message for i-structure:
1205          buf_si[0]:                 nrows to be sent
1206                [1:nrows]:           row index (global)
1207                [nrows+1:2*nrows+1]: i-structure index
1208     */
1209     /*-------------------------------------------*/
1210     nrows       = len_si[proc]/2 - 1;
1211     buf_si_i    = buf_si + nrows+1;
1212     buf_si[0]   = nrows;
1213     buf_si_i[0] = 0;
1214     nrows       = 0;
1215     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1216       nzi               = coi[i+1] - coi[i];
1217       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1218       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1219       nrows++;
1220     }
1221     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1222     k++;
1223     buf_si += len_si[proc];
1224   }
1225   i = merge->nrecv;
1226   while (i--) {
1227     PetscMPIInt icompleted;
1228     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1229   }
1230   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1231   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1232   ierr = PetscFree(len_si);CHKERRQ(ierr);
1233   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1234   ierr = PetscFree(swaits);CHKERRQ(ierr);
1235   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1236   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1237 
1238   /* compute the local portion of C (mpi mat) */
1239   /*------------------------------------------*/
1240   /* allocate bi array and free space for accumulating nonzero column info */
1241   ierr  = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
1242   bi[0] = 0;
1243 
1244   /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1245   nnz           = fill*(pdti[pn] + poti[pon] + ai[am]);
1246   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1247   current_space = free_space;
1248 
1249   ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr);
1250   for (k=0; k<merge->nrecv; k++) {
1251     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1252     nrows       = *buf_ri_k[k];
1253     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
1254     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of the k-th received i-structure */
1255   }
1256 
1257   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1258   rmax = 0;
1259   for (i=0; i<pn; i++) {
1260     /* add pdt[i,:]*AP into lnk */
1261     pnz = pdti[i+1] - pdti[i];
1262     ptJ = pdtj + pdti[i];
1263     for (j=0; j<pnz; j++) {
1264       row  = ptJ[j];  /* row of AP == col of Pt */
1265       anz  = ai[row+1] - ai[row];
1266       Jptr = aj + ai[row];
1267       /* add non-zero cols of AP into the sorted linked list lnk */
1268       ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1269     }
1270 
1271     /* add received col data into lnk */
1272     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1273       if (i == *nextrow[k]) { /* i-th row */
1274         nzi  = *(nextci[k]+1) - *nextci[k];
1275         Jptr = buf_rj[k] + *nextci[k];
1276         ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1277         nextrow[k]++; nextci[k]++;
1278       }
1279     }
1280     nnz = lnk[0];
1281 
1282     /* if free space is not available, make more free space */
1283     if (current_space->local_remaining<nnz) {
1284       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1285       nspacedouble++;
1286     }
1287     /* copy data into free space, then initialize lnk */
1288     ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1289     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1290 
1291     current_space->array           += nnz;
1292     current_space->local_used      += nnz;
1293     current_space->local_remaining -= nnz;
1294 
1295     bi[i+1] = bi[i] + nnz;
1296     if (nnz > rmax) rmax = nnz;
1297   }
1298   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1299 
1300   ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
1301   ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1302 
1303   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1304   if (afill_tmp > afill) afill = afill_tmp;
1305   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
1306   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1307   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1308 
1309   /* create symbolic parallel matrix Cmpi; why can this not be assembled in the Numeric part? */
1310   /*----------------------------------------------------------------------------------*/
1311   ierr = PetscMalloc((rmax+1)*sizeof(PetscScalar),&vals);CHKERRQ(ierr);
1312   ierr = PetscMemzero(vals,rmax*sizeof(PetscScalar));CHKERRQ(ierr);
1313 
1314   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1315   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1316   ierr = MatSetBlockSizes(Cmpi,P->cmap->bs,A->cmap->bs);CHKERRQ(ierr);
1317   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1318   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1319   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1320   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1321   for (i=0; i<pn; i++) {
1322     row  = i + rstart;
1323     nnz  = bi[i+1] - bi[i];
1324     Jptr = bj + bi[i];
1325     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1326   }
1327   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1328   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1329   ierr = PetscFree(vals);CHKERRQ(ierr);
1330 
1331   merge->bi        = bi;
1332   merge->bj        = bj;
1333   merge->coi       = coi;
1334   merge->coj       = coj;
1335   merge->buf_ri    = buf_ri;
1336   merge->buf_rj    = buf_rj;
1337   merge->owners_co = owners_co;
1338   merge->destroy   = Cmpi->ops->destroy;
1339   merge->duplicate = Cmpi->ops->duplicate;
1340 
1341   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
1342   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1343 
1344   /* attach the supporting struct to Cmpi for reuse */
1345   c           = (Mat_MPIAIJ*)Cmpi->data;
1346   c->ptap     = ptap;
1347   ptap->api   = NULL;
1348   ptap->apj   = NULL;
1349   ptap->merge = merge;
1350   ptap->rmax  = rmax;
1351 
1352   *C = Cmpi;
1353 #if defined(PETSC_USE_INFO)
1354   if (bi[pn] != 0) {
1355     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
1356     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);CHKERRQ(ierr);
1357   } else {
1358     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1359   }
1360 #endif
1361   PetscFunctionReturn(0);
1362 }
1363 
1364 #undef __FUNCT__
1365 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable"
1366 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable(Mat P,Mat A,Mat C)
1367 {
1368   PetscErrorCode      ierr;
1369   Mat_Merge_SeqsToMPI *merge;
1370   Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1371   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1372   Mat_PtAPMPI         *ptap;
1373   PetscInt            *adj;
1374   PetscInt            i,j,k,anz,pnz,row,*cj,nexta;
1375   MatScalar           *ada,*ca,valtmp;
1376   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1377   MPI_Comm            comm;
1378   PetscMPIInt         size,rank,taga,*len_s;
1379   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1380   PetscInt            **buf_ri,**buf_rj;
1381   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
1382   MPI_Request         *s_waits,*r_waits;
1383   MPI_Status          *status;
1384   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
1385   PetscInt            *ai,*aj,*coi,*coj;
1386   PetscInt            *poJ,*pdJ;
1387   Mat                 A_loc;
1388   Mat_SeqAIJ          *a_loc;
1389 
1390   PetscFunctionBegin;
1391   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1392   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1393   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1394 
1395   ptap  = c->ptap;
1396   merge = ptap->merge;
1397 
1398   /* 2) compute numeric C_seq = P_loc^T*A_loc */
1399   /*------------------------------------------*/
1400   /* get data from symbolic products */
1401   coi    = merge->coi; coj = merge->coj;
1402   ierr   = PetscMalloc((coi[pon]+1)*sizeof(MatScalar),&coa);CHKERRQ(ierr);
1403   ierr   = PetscMemzero(coa,coi[pon]*sizeof(MatScalar));CHKERRQ(ierr);
1404   bi     = merge->bi; bj = merge->bj;
1405   owners = merge->rowmap->range;
1406   ierr   = PetscMalloc((bi[cm]+1)*sizeof(MatScalar),&ba);CHKERRQ(ierr);
1407   ierr   = PetscMemzero(ba,bi[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1408 
1409   /* get A_loc by taking all local rows of A */
1410   A_loc = ptap->A_loc;
1411   ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
1412   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1413   ai    = a_loc->i;
1414   aj    = a_loc->j;
1415 
1416   for (i=0; i<am; i++) {
1417     anz = ai[i+1] - ai[i];
1418     adj = aj + ai[i];
1419     ada = a_loc->a + ai[i];
1420 
1421     /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1422     /*-------------------------------------------------------------*/
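         /* The symbolic phase guarantees that adj (cols of A_loc[i,:]) is a subsequence of
            cj (cols of row 'row' of Co), so the scatter below is a single forward merge.
            Hypothetical example: adj = {1,4,7}, cj = {0,1,3,4,7,9} -> matches at k = 1,3,4,
            and ca[1],ca[3],ca[4] are incremented by valtmp*ada[0..2]. */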
1423     /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1424     pnz = po->i[i+1] - po->i[i];
1425     poJ = po->j + po->i[i];
1426     pA  = po->a + po->i[i];
1427     for (j=0; j<pnz; j++) {
1428       row = poJ[j];
1429       cj  = coj + coi[row];
1430       ca  = coa + coi[row];
1431       /* perform sparse axpy */
1432       nexta  = 0;
1433       valtmp = pA[j];
1434       for (k=0; nexta<anz; k++) {
1435         if (cj[k] == adj[nexta]) {
1436           ca[k] += valtmp*ada[nexta];
1437           nexta++;
1438         }
1439       }
1440       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1441     }
1442 
1443     /* put the value into Cd (diagonal part) */
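         /* Same forward-merge scatter as for Co above, but these rows of C are locally
            owned, so the products accumulate directly into ba; off-process contributions
            to the same rows arrive in steps 3)-4) below. */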
1444     pnz = pd->i[i+1] - pd->i[i];
1445     pdJ = pd->j + pd->i[i];
1446     pA  = pd->a + pd->i[i];
1447     for (j=0; j<pnz; j++) {
1448       row = pdJ[j];
1449       cj  = bj + bi[row];
1450       ca  = ba + bi[row];
1451       /* perform sparse axpy */
1452       nexta  = 0;
1453       valtmp = pA[j];
1454       for (k=0; nexta<anz; k++) {
1455         if (cj[k] == adj[nexta]) {
1456           ca[k] += valtmp*ada[nexta];
1457           nexta++;
1458         }
1459       }
1460       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1461     }
1462   }
1463 
1464   /* 3) send and recv matrix values coa */
1465   /*------------------------------------*/
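       /* The rows of Co computed above belong to other processes (their destinations were
          determined in the symbolic phase via owners_co), so their values are sent to the
          owning processes here using the message lengths merge->len_s; the matching
          receives are posted first. */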
1466   buf_ri = merge->buf_ri;
1467   buf_rj = merge->buf_rj;
1468   len_s  = merge->len_s;
1469   ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
1470   ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1471 
1472   ierr = PetscMalloc2(merge->nsend+1,MPI_Request,&s_waits,size,MPI_Status,&status);CHKERRQ(ierr);
1473   for (proc=0,k=0; proc<size; proc++) {
1474     if (!len_s[proc]) continue;
1475     i    = merge->owners_co[proc];
1476     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1477     k++;
1478   }
1479   if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
1480   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
1481 
1482   ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
1483   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1484   ierr = PetscFree(coa);CHKERRQ(ierr);
1485 
1486   /* 4) insert local Cseq and received values into Cmpi */
1487   /*----------------------------------------------------*/
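       /* For each locally owned row i: the local values ba_i (pattern bj_i from the
          symbolic phase) are augmented with the matching entries of every received
          message; each received column list cj is a subsequence of bj_i, so a single
          forward scan with nextcj suffices, and the completed row is inserted into C. */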
1488   ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr);
1489   for (k=0; k<merge->nrecv; k++) {
1490     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1491     nrows       = *(buf_ri_k[k]);
1492     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
1493     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of the k-th received i-structure */
1494   }
1495 
1496   for (i=0; i<cm; i++) {
1497     row  = owners[rank] + i; /* global row index of C_seq */
1498     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1499     ba_i = ba + bi[i];
1500     bnz  = bi[i+1] - bi[i];
1501     /* add received vals into ba */
1502     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1503       /* i-th row */
1504       if (i == *nextrow[k]) {
1505         cnz    = *(nextci[k]+1) - *nextci[k];
1506         cj     = buf_rj[k] + *(nextci[k]);
1507         ca     = abuf_r[k] + *(nextci[k]);
1508         nextcj = 0;
1509         for (j=0; nextcj<cnz; j++) {
1510           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1511             ba_i[j] += ca[nextcj++];
1512           }
1513         }
1514         nextrow[k]++; nextci[k]++;
1515         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1516       }
1517     }
1518     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
1519   }
1520   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1521   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1522 
1523   ierr = PetscFree(ba);CHKERRQ(ierr);
1524   ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
1525   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1526   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1527   PetscFunctionReturn(0);
1528 }
1529 
1530 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1531 #undef __FUNCT__
1532 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable"
1533 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(Mat P,Mat A,PetscReal fill,Mat *C)
1534 {
1535   PetscErrorCode      ierr;
1536   Mat                 Cmpi,A_loc,POt,PDt;
1537   Mat_PtAPMPI         *ptap;
1538   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1539   Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
1540   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1541   PetscInt            nnz;
1542   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1543   PetscInt            am  =A->rmap->n,pn=P->cmap->n;
1544   MPI_Comm            comm;
1545   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1546   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1547   PetscInt            len,proc,*dnz,*onz,*owners;
1548   PetscInt            nzi,*bi,*bj;
1549   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1550   MPI_Request         *swaits,*rwaits;
1551   MPI_Status          *sstatus,rstatus;
1552   Mat_Merge_SeqsToMPI *merge;
1553   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1554   PetscReal           afill  =1.0,afill_tmp;
1555   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
1556   PetscScalar         *vals;
1557   Mat_SeqAIJ          *a_loc, *pdt,*pot;
1558 
1559   PetscFunctionBegin;
1560   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1561   /* check if matrix local sizes are compatible */
1562   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
1563     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1564   }
1565 
1566   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1567   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1568 
1569   /* create struct Mat_PtAPMPI and attach it to C later */
1570   ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr);
1571 
1572   /* get A_loc by taking all local rows of A */
1573   ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
1574 
1575   ptap->A_loc = A_loc;
1576   a_loc       = (Mat_SeqAIJ*)(A_loc)->data;
1577   ai          = a_loc->i;
1578   aj          = a_loc->j;
1579 
1580   /* determine symbolic Co=(p->B)^T*A - send to others */
1581   /*----------------------------------------------------*/
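       /* PDt and POt are symbolic transposes of the diagonal (p->A) and off-diagonal (p->B)
          blocks of P.  A row of PDt corresponds to a locally owned column of P (= local row
          of C), a row of POt to an off-process column of P (= row of C owned elsewhere);
          the entries of either give local rows of P, i.e. the rows of A_loc to merge. */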
1582   ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
1583   pdt  = (Mat_SeqAIJ*)PDt->data;
1584   pdti = pdt->i; pdtj = pdt->j;
1585 
1586   ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
1587   pot  = (Mat_SeqAIJ*)POt->data;
1588   poti = pot->i; potj = pot->j;
1589 
1590   /* then, compute symbolic Co = (p->B)^T*A */
1591   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1592                          >= (num of nonzero rows of C_seq) - pn */
1593   ierr   = PetscMalloc((pon+1)*sizeof(PetscInt),&coi);CHKERRQ(ierr);
1594   coi[0] = 0;
1595 
1596   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1597   nnz           = fill*(poti[pon] + ai[am]);
1598   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1599   current_space = free_space;
1600 
1601   /* create and initialize a linked list */
1602   i     = PetscMax(pdt->rmax,pot->rmax);
1603   Crmax = i*a_loc->rmax*size; /* non-scalable! */
1604   if (!Crmax || Crmax > aN) Crmax = aN;
1605   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
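       /* Crmax is a crude overestimate of the number of distinct columns in any row of C:
          (max nonzeros per row of the local Pt blocks) * (max nonzeros per row of A_loc) *
          (number of processes), capped at the global column count aN.  The scalable variant
          of the condensed linked list is sized by Crmax rather than by aN. */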
1606 
1607   for (i=0; i<pon; i++) {
1608     pnz = poti[i+1] - poti[i];
1609     ptJ = potj + poti[i];
1610     for (j=0; j<pnz; j++) {
1611       row  = ptJ[j]; /* row of A_loc == col of Pot */
1612       anz  = ai[row+1] - ai[row];
1613       Jptr = aj + ai[row];
1614       /* add non-zero cols of this row of A_loc into the sorted linked list lnk */
1615       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1616     }
1617     nnz = lnk[0];
1618 
1619     /* If free space is not available, double the total space in the list */
1620     if (current_space->local_remaining<nnz) {
1621       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1622       nspacedouble++;
1623     }
1624 
1625     /* copy data into free space, then reinitialize lnk */
1626     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1627 
1628     current_space->array           += nnz;
1629     current_space->local_used      += nnz;
1630     current_space->local_remaining -= nnz;
1631 
1632     coi[i+1] = coi[i] + nnz;
1633   }
1634 
1635   ierr = PetscMalloc((coi[pon]+1)*sizeof(PetscInt),&coj);CHKERRQ(ierr);
1636   ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
1637 
1638   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1639   if (afill_tmp > afill) afill = afill_tmp;
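       /* afill tracks the achieved fill ratio: nonzeros of the product structure divided by
          the input nonzeros used to size the initial free space.  The maximum over the Co
          and local-C estimates is reported via PetscInfo below as the fill actually needed. */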
1640 
1641   /* send j-array (coj) of Co to other processors */
1642   /*----------------------------------------------*/
1643   /* determine row ownership */
1644   ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
1645   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1646 
1647   merge->rowmap->n  = pn;
1648   merge->rowmap->bs = 1;
1649 
1650   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1651   owners = merge->rowmap->range;
1652 
1653   /* determine the number of messages to send, their lengths */
1654   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
1655   ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1656   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
1657 
1658   len_s        = merge->len_s;
1659   merge->nsend = 0;
1660 
1661   ierr = PetscMalloc((size+2)*sizeof(PetscInt),&owners_co);CHKERRQ(ierr);
1662   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1663 
1664   proc = 0;
1665   for (i=0; i<pon; i++) {
1666     while (prmap[i] >= owners[proc+1]) proc++;
1667     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1668     len_s[proc] += coi[i+1] - coi[i];
1669   }
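       /* len_si[proc] counts the rows of Co destined for proc (one i-structure entry each,
          later converted to the i-structure message length 2*(nrows+1)), while len_s[proc]
          counts their total nonzeros (the length of the j/value messages); prmap[] maps local
          off-diagonal columns of P to global rows of C, and owners[] gives the ownership
          ranges used to pick the destination process. */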
1670 
1671   len          = 0; /* max length of buf_si[] */
1672   owners_co[0] = 0;
1673   for (proc=0; proc<size; proc++) {
1674     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1675     if (len_si[proc]) {
1676       merge->nsend++;
1677       len_si[proc] = 2*(len_si[proc] + 1);
1678       len         += len_si[proc];
1679     }
1680   }
1681 
1682   /* determine the number and length of messages to receive for coi and coj  */
1683   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1684   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1685 
1686   /* post the Irecv and Isend of coj */
1687   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1688   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1689   ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&swaits);CHKERRQ(ierr);
1690   for (proc=0, k=0; proc<size; proc++) {
1691     if (!len_s[proc]) continue;
1692     i    = owners_co[proc];
1693     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1694     k++;
1695   }
1696 
1697   /* receives and sends of coj are complete */
1698   ierr = PetscMalloc(size*sizeof(MPI_Status),&sstatus);CHKERRQ(ierr);
1699   for (i=0; i<merge->nrecv; i++) {
1700     PetscMPIInt icompleted;
1701     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1702   }
1703   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1704   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1705 
1706   /* send and recv coi */
1707   /*-------------------*/
1708   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1709   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1710   ierr   = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
1711   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1712   for (proc=0,k=0; proc<size; proc++) {
1713     if (!len_s[proc]) continue;
1714     /* form outgoing message for i-structure:
1715          buf_si[0]:                 nrows to be sent
1716                [1:nrows]:           row index (global)
1717                [nrows+1:2*nrows+1]: i-structure index
1718     */
1719     /*-------------------------------------------*/
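         /* Hypothetical example: sending 3 rows with 2, 0 and 5 nonzeros gives
            len_si[proc] = 2*(3+1) = 8 and buf_si = {3, r0, r1, r2, 0, 2, 2, 7},
            where r0..r2 are the row indices local to proc. */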
1720     nrows       = len_si[proc]/2 - 1;
1721     buf_si_i    = buf_si + nrows+1;
1722     buf_si[0]   = nrows;
1723     buf_si_i[0] = 0;
1724     nrows       = 0;
1725     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1726       nzi               = coi[i+1] - coi[i];
1727       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
1728       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1729       nrows++;
1730     }
1731     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1732     k++;
1733     buf_si += len_si[proc];
1734   }
1735   i = merge->nrecv;
1736   while (i--) {
1737     PetscMPIInt icompleted;
1738     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1739   }
1740   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1741   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1742   ierr = PetscFree(len_si);CHKERRQ(ierr);
1743   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1744   ierr = PetscFree(swaits);CHKERRQ(ierr);
1745   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1746   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1747 
1748   /* compute the local portion of C (mpi mat) */
1749   /*------------------------------------------*/
1750   /* allocate bi array and free space for accumulating nonzero column info */
1751   ierr  = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
1752   bi[0] = 0;
1753 
1754   /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
1755   nnz           = fill*(pdti[pn] + poti[pon] + ai[am]);
1756   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1757   current_space = free_space;
1758 
1759   ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr);
1760   for (k=0; k<merge->nrecv; k++) {
1761     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1762     nrows       = *buf_ri_k[k];
1763     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
1764     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of the k-th received i-structure */
1765   }
1766 
1767   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1768   rmax = 0;
1769   for (i=0; i<pn; i++) {
1770     /* add the structure of pdt[i,:]*A into lnk */
1771     pnz = pdti[i+1] - pdti[i];
1772     ptJ = pdtj + pdti[i];
1773     for (j=0; j<pnz; j++) {
1774       row  = ptJ[j];  /* row of A == col of Pt */
1775       anz  = ai[row+1] - ai[row];
1776       Jptr = aj + ai[row];
1777       /* add non-zero cols of this row of A into the sorted linked list lnk */
1778       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1779     }
1780 
1781     /* add received col data into lnk */
1782     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1783       if (i == *nextrow[k]) { /* i-th row */
1784         nzi  = *(nextci[k]+1) - *nextci[k];
1785         Jptr = buf_rj[k] + *nextci[k];
1786         ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
1787         nextrow[k]++; nextci[k]++;
1788       }
1789     }
1790     nnz = lnk[0];
1791 
1792     /* if free space is not available, make more free space */
1793     if (current_space->local_remaining<nnz) {
1794       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1795       nspacedouble++;
1796     }
1797     /* copy data into free space, then initialize lnk */
1798     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1799     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1800 
1801     current_space->array           += nnz;
1802     current_space->local_used      += nnz;
1803     current_space->local_remaining -= nnz;
1804 
1805     bi[i+1] = bi[i] + nnz;
1806     if (nnz > rmax) rmax = nnz;
1807   }
1808   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1809 
1810   ierr      = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
1811   ierr      = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1812   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1813   if (afill_tmp > afill) afill = afill_tmp;
1814   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
1815   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1816   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1817 
1818   /* create symbolic parallel matrix Cmpi (open question: why can it not be assembled in the numeric part?) */
1819   /*----------------------------------------------------------------------------------*/
1820   ierr = PetscMalloc((rmax+1)*sizeof(PetscScalar),&vals);CHKERRQ(ierr);
1821   ierr = PetscMemzero(vals,rmax*sizeof(PetscScalar));CHKERRQ(ierr);
1822 
1823   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1824   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1825   ierr = MatSetBlockSizes(Cmpi,P->cmap->bs,A->cmap->bs);CHKERRQ(ierr);
1826   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1827   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1828   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1829   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1830   for (i=0; i<pn; i++) {
1831     row  = i + rstart;
1832     nnz  = bi[i+1] - bi[i];
1833     Jptr = bj + bi[i];
1834     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1835   }
1836   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1837   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1838   ierr = PetscFree(vals);CHKERRQ(ierr);
1839 
1840   merge->bi        = bi;
1841   merge->bj        = bj;
1842   merge->coi       = coi;
1843   merge->coj       = coj;
1844   merge->buf_ri    = buf_ri;
1845   merge->buf_rj    = buf_rj;
1846   merge->owners_co = owners_co;
1847   merge->destroy   = Cmpi->ops->destroy;
1848   merge->duplicate = Cmpi->ops->duplicate;
1849 
1850   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable;
1851   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1852 
1853   /* attach the supporting struct to Cmpi for reuse */
1854   c = (Mat_MPIAIJ*)Cmpi->data;
1855 
1856   c->ptap     = ptap;
1857   ptap->api   = NULL;
1858   ptap->apj   = NULL;
1859   ptap->merge = merge;
1860   ptap->rmax  = rmax;
1861   ptap->apa   = NULL;
1862 
1863   *C = Cmpi;
1864 #if defined(PETSC_USE_INFO)
1865   if (bi[pn] != 0) {
1866     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
1867     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);CHKERRQ(ierr);
1868   } else {
1869     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1870   }
1871 #endif
1872   PetscFunctionReturn(0);
1873 }
1874