xref: /petsc/src/mat/impls/aij/mpi/mpimatmatmult.c (revision 8066bbec27a9b73c92d6586e89d7d2fd6ee7e4aa)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of MPIAIJ matrices
4           C = A * B
5 */
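/*
   A minimal usage sketch of these routines through the public interface (this
   block is illustrative only and not part of the implementation below; A and B
   are assumed to be assembled MATMPIAIJ matrices with compatible layouts):

     Mat C;
     ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
     ... change numerical values of A and/or B without altering their nonzero patterns ...
     ierr = MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
     ierr = MatDestroy(&C);CHKERRQ(ierr);
*/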
6 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7 #include <../src/mat/utils/freespace.h>
8 #include <../src/mat/impls/aij/mpi/mpiaij.h>
9 #include <petscbt.h>
10 #include <../src/mat/impls/dense/mpi/mpidense.h>
11 #include <petsc/private/vecimpl.h>
12 
13 #if defined(PETSC_HAVE_HYPRE)
14 PETSC_INTERN PetscErrorCode MatMatMultSymbolic_AIJ_AIJ_wHYPRE(Mat,Mat,PetscReal,Mat*);
15 #endif
16 
17 PETSC_INTERN PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C)
18 {
19   PetscErrorCode ierr;
20 #if defined(PETSC_HAVE_HYPRE)
21   const char     *algTypes[3] = {"scalable","nonscalable","hypre"};
22   PetscInt       nalg = 3;
23 #else
24   const char     *algTypes[2] = {"scalable","nonscalable"};
25   PetscInt       nalg = 2;
26 #endif
27   PetscInt       alg = 1; /* set default algorithm */
28   MPI_Comm       comm;
29 
30   PetscFunctionBegin;
31   if (scall == MAT_INITIAL_MATRIX) {
32     ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
33     if (A->cmap->rstart != B->rmap->rstart || A->cmap->rend != B->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D,%D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,B->rmap->rstart,B->rmap->rend);
34 
35     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
36     ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[1],&alg,NULL);CHKERRQ(ierr);
37     ierr = PetscOptionsEnd();CHKERRQ(ierr);
38 
39     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
40     switch (alg) {
41     case 1:
42       ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A,B,fill,C);CHKERRQ(ierr);
43       break;
44 #if defined(PETSC_HAVE_HYPRE)
45     case 2:
46       ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr);
47       break;
48 #endif
49     default:
50       ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);CHKERRQ(ierr);
51       break;
52     }
53     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
54   }
55   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
56   ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
57   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
58   PetscFunctionReturn(0);
59 }
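/*
   The symbolic algorithm above is chosen only on the MAT_INITIAL_MATRIX call and
   can be switched at runtime, e.g. (hedged example invocations; "hypre" is only
   available when PETSc is configured with hypre):

     ./myapp -matmatmult_via nonscalable   (default)
     ./myapp -matmatmult_via scalable
     ./myapp -matmatmult_via hypre
*/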
60 
61 PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
62 {
63   PetscErrorCode ierr;
64   Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
65   Mat_PtAPMPI    *ptap = a->ptap;
66 
67   PetscFunctionBegin;
68   ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
69   ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
70   ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
71   ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
72   ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);
73   ierr = PetscFree(ptap->api);CHKERRQ(ierr);
74   ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
75   ierr = PetscFree(ptap->apa);CHKERRQ(ierr);
76   ierr = ptap->destroy(A);CHKERRQ(ierr);
77   ierr = PetscFree(ptap);CHKERRQ(ierr);
78   PetscFunctionReturn(0);
79 }
80 
81 PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
82 {
83   PetscErrorCode ierr;
84   Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
85   Mat_PtAPMPI    *ptap = a->ptap;
86 
87   PetscFunctionBegin;
88   ierr = (*ptap->duplicate)(A,op,M);CHKERRQ(ierr);
89 
90   (*M)->ops->destroy   = ptap->destroy;   /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
91   (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
92   PetscFunctionReturn(0);
93 }
94 
95 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,Mat C)
96 {
97   PetscErrorCode ierr;
98   Mat_MPIAIJ     *a  =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
99   Mat_SeqAIJ     *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
100   Mat_SeqAIJ     *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
101   PetscScalar    *cda=cd->a,*coa=co->a;
102   Mat_SeqAIJ     *p_loc,*p_oth;
103   PetscScalar    *apa,*ca;
104   PetscInt       cm   =C->rmap->n;
105   Mat_PtAPMPI    *ptap=c->ptap;
106   PetscInt       *api,*apj,*apJ,i,k;
107   PetscInt       cstart=C->cmap->rstart;
108   PetscInt       cdnz,conz,k0,k1;
109   MPI_Comm       comm;
110   PetscMPIInt    size;
111 
112   PetscFunctionBegin;
113   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
114   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
115 
116   /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
117   /*-----------------------------------------------------*/
118   /* update numerical values of P_oth and P_loc */
119   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
120   ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
121 
122   /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
123   /*----------------------------------------------------------*/
124   /* get data from symbolic products */
125   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
126   p_oth = NULL;
127   if (size >1) {
128     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
129   }
130 
131   /* get apa for storing dense row A[i,:]*P */
132   apa = ptap->apa;
133 
134   api = ptap->api;
135   apj = ptap->apj;
136   for (i=0; i<cm; i++) {
137     /* compute apa = A[i,:]*P */
138     AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
139 
140     /* set values in C */
141     apJ  = apj + api[i];
142     cdnz = cd->i[i+1] - cd->i[i];
143     conz = co->i[i+1] - co->i[i];
144 
145     /* 1st off-diagonal part of C */
146     ca = coa + co->i[i];
147     k  = 0;
148     for (k0=0; k0<conz; k0++) {
149       if (apJ[k] >= cstart) break;
150       ca[k0]      = apa[apJ[k]];
151       apa[apJ[k]] = 0.0;
152       k++;
153     }
154 
155     /* diagonal part of C */
156     ca = cda + cd->i[i];
157     for (k1=0; k1<cdnz; k1++) {
158       ca[k1]      = apa[apJ[k]];
159       apa[apJ[k]] = 0.0;
160       k++;
161     }
162 
163     /* 2nd off-diagonal part of C */
164     ca = coa + co->i[i];
165     for (; k0<conz; k0++) {
166       ca[k0]      = apa[apJ[k]];
167       apa[apJ[k]] = 0.0;
168       k++;
169     }
170   }
171   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
172   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
173   PetscFunctionReturn(0);
174 }
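/*
   AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa), used above, accumulates row i of
   A*P into the dense work array apa (length P->cmap->N).  A rough sketch of the
   operation it performs (the actual macro is defined elsewhere in the MPIAIJ
   sources; this is only a paraphrase):

     for each nonzero A_diag(i,r):              for each nonzero P_loc(r,c):  apa[c] += A_diag(i,r)*P_loc(r,c)
     for each nonzero A_off(i,r)  (if size>1):  for each nonzero P_oth(r,c):  apa[c] += A_off(i,r)*P_oth(r,c)

   The loop body above then drains apa into the diagonal (cd) and off-diagonal
   (co) parts of C in column order, zeroing each apa entry as it is consumed so
   the array is clean for the next row.
*/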
175 
176 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,PetscReal fill,Mat *C)
177 {
178   PetscErrorCode     ierr;
179   MPI_Comm           comm;
180   PetscMPIInt        size;
181   Mat                Cmpi;
182   Mat_PtAPMPI        *ptap;
183   PetscFreeSpaceList free_space=NULL,current_space=NULL;
184   Mat_MPIAIJ         *a        =(Mat_MPIAIJ*)A->data,*c;
185   Mat_SeqAIJ         *ad       =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
186   PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
187   PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
188   PetscInt           *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
189   PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n,Crmax;
190   PetscBT            lnkbt;
191   PetscScalar        *apa;
192   PetscReal          afill;
193   PetscTable         ta;
194 
195   PetscFunctionBegin;
196   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
197   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
198 
199   /* create struct Mat_PtAPMPI and attach it to C later */
200   ierr = PetscNew(&ptap);CHKERRQ(ierr);
201 
202   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
203   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
204 
205   /* get P_loc by taking all local rows of P */
206   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
207 
208   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
209   pi_loc = p_loc->i; pj_loc = p_loc->j;
210   if (size > 1) {
211     p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
212     pi_oth = p_oth->i; pj_oth = p_oth->j;
213   } else {
214     p_oth = NULL;
215     pi_oth = NULL; pj_oth = NULL;
216   }
217 
218   /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
219   /*-------------------------------------------------------------------*/
220   ierr      = PetscMalloc1(am+2,&api);CHKERRQ(ierr);
221   ptap->api = api;
222   api[0]    = 0;
223 
224   /* create and initialize a linked list */
225   ierr = PetscTableCreate(pN,pN,&ta);CHKERRQ(ierr);
226   MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
227   MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
228   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
229   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
230 
231   ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);
232 
233   /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
234   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);CHKERRQ(ierr);
235   current_space = free_space;
236 
237   ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
238   for (i=0; i<am; i++) {
239     /* diagonal portion of A */
240     nzi = adi[i+1] - adi[i];
241     for (j=0; j<nzi; j++) {
242       row  = *adj++;
243       pnz  = pi_loc[row+1] - pi_loc[row];
244       Jptr = pj_loc + pi_loc[row];
245       /* add non-zero cols of P into the sorted linked list lnk */
246       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
247     }
248     /* off-diagonal portion of A */
249     nzi = aoi[i+1] - aoi[i];
250     for (j=0; j<nzi; j++) {
251       row  = *aoj++;
252       pnz  = pi_oth[row+1] - pi_oth[row];
253       Jptr = pj_oth + pi_oth[row];
254       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
255     }
256 
257     apnz     = lnk[0];
258     api[i+1] = api[i] + apnz;
259 
260     /* if free space is not available, double the total space in the list */
261     if (current_space->local_remaining<apnz) {
262       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
263       nspacedouble++;
264     }
265 
266     /* Copy data into free space, then initialize lnk */
267     ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
268     ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);
269 
270     current_space->array           += apnz;
271     current_space->local_used      += apnz;
272     current_space->local_remaining -= apnz;
273   }
274 
275   /* Allocate space for apj, initialize apj, and */
276   /* destroy list of free space and other temporary array(s) */
277   ierr = PetscMalloc1(api[am]+1,&ptap->apj);CHKERRQ(ierr);
278   apj  = ptap->apj;
279   ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
280   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
281 
282   /* malloc apa to store dense row A[i,:]*P */
283   ierr = PetscCalloc1(pN,&apa);CHKERRQ(ierr);
284 
285   ptap->apa = apa;
286 
287   /* create and assemble symbolic parallel matrix Cmpi */
288   /*----------------------------------------------------*/
289   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
290   ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
291   ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);
292 
293   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
294   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
295   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
296   for (i=0; i<am; i++) {
297     row  = i + rstart;
298     apnz = api[i+1] - api[i];
299     ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
300     apj += apnz;
301   }
302   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
303   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
304 
305   ptap->destroy        = Cmpi->ops->destroy;
306   ptap->duplicate      = Cmpi->ops->duplicate;
307   Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
308   Cmpi->ops->destroy   = MatDestroy_MPIAIJ_MatMatMult;
309   Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
310 
311   /* attach the supporting struct to Cmpi for reuse */
312   c       = (Mat_MPIAIJ*)Cmpi->data;
313   c->ptap = ptap;
314 
315   *C = Cmpi;
316 
317   /* set MatInfo */
318   afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
319   if (afill < 1.0) afill = 1.0;
320   Cmpi->info.mallocs           = nspacedouble;
321   Cmpi->info.fill_ratio_given  = fill;
322   Cmpi->info.fill_ratio_needed = afill;
323 
324 #if defined(PETSC_USE_INFO)
325   if (api[am]) {
326     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
327     ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
328   } else {
329     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
330   }
331 #endif
332   PetscFunctionReturn(0);
333 }
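/*
   Worked example of the fill ratio reported above (all numbers hypothetical):
   with nnz(A_diag)+nnz(A_off) = 600, nnz(P_loc) = 400 and a symbolic product of
   size api[am] = 2500, afill = 2500/1001 + 1e-5, roughly 2.5.  Passing
   fill = 2.5 on a later MatMatMult() call would then avoid the nspacedouble
   reallocations of the free-space list.
*/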
334 
335 PETSC_INTERN PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
336 {
337   PetscErrorCode ierr;
338 
339   PetscFunctionBegin;
340   if (scall == MAT_INITIAL_MATRIX) {
341     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
342     ierr = MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);CHKERRQ(ierr);
343     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
344   }
345   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
346   ierr = MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);CHKERRQ(ierr);
347   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
348   PetscFunctionReturn(0);
349 }
350 
351 typedef struct {
352   Mat         workB;
353   PetscScalar *rvalues,*svalues;
354   MPI_Request *rwaits,*swaits;
355 } MPIAIJ_MPIDense;
356 
357 PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
358 {
359   MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
360   PetscErrorCode  ierr;
361 
362   PetscFunctionBegin;
363   ierr = MatDestroy(&contents->workB);CHKERRQ(ierr);
364   ierr = PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);CHKERRQ(ierr);
365   ierr = PetscFree(contents);CHKERRQ(ierr);
366   PetscFunctionReturn(0);
367 }
368 
369 /*
370     This is a "dummy function" that handles the case where matrix C was created as a dense matrix
371   directly by the user and passed to MatMatMult() with the MAT_REUSE_MATRIX option
372 
373   It is the same as MatMatMultSymbolic_MPIAIJ_MPIDense() except that it does not create C
374 */
375 PetscErrorCode MatMatMultNumeric_MPIDense(Mat A,Mat B,Mat C)
376 {
377   PetscErrorCode         ierr;
378   PetscBool              flg;
379   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
380   PetscInt               nz   = aij->B->cmap->n;
381   PetscContainer         container;
382   MPIAIJ_MPIDense        *contents;
383   VecScatter             ctx   = aij->Mvctx;
384   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
385   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
386 
387   PetscFunctionBegin;
388   ierr = PetscObjectTypeCompare((PetscObject)B,MATMPIDENSE,&flg);CHKERRQ(ierr);
389   if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Second matrix must be mpidense");
390 
391   /* Handle the case where the user provided the final C matrix rather than calling MatMatMult() with MAT_INITIAL_MATRIX */
392   ierr = PetscObjectTypeCompare((PetscObject)A,MATMPIAIJ,&flg);CHKERRQ(ierr);
393   if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"First matrix must be MPIAIJ");
394 
395   C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
396 
397   ierr = PetscNew(&contents);CHKERRQ(ierr);
398   /* Create work matrix used to store off processor rows of B needed for local product */
399   ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
400   /* Create work arrays needed */
401   ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
402                       B->cmap->N*to->starts[to->n],&contents->svalues,
403                       from->n,&contents->rwaits,
404                       to->n,&contents->swaits);CHKERRQ(ierr);
405 
406   ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr);
407   ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
408   ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
409   ierr = PetscObjectCompose((PetscObject)C,"workB",(PetscObject)container);CHKERRQ(ierr);
410   ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
411 
412   ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr);
413   PetscFunctionReturn(0);
414 }
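/*
   A hedged sketch of the scenario handled above: the user builds the dense
   product matrix C directly and skips the MAT_INITIAL_MATRIX call (comm, m, n,
   M, N are placeholders for layouts compatible with A*B):

     Mat C;
     ierr = MatCreateDense(comm,m,n,M,N,NULL,&C);CHKERRQ(ierr);
     ierr = MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);

   The first numeric call attaches the "workB" container to C and resets
   C->ops->matmultnumeric, so later calls go directly to
   MatMatMultNumeric_MPIAIJ_MPIDense().
*/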
415 
416 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
417 {
418   PetscErrorCode         ierr;
419   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
420   PetscInt               nz   = aij->B->cmap->n;
421   PetscContainer         container;
422   MPIAIJ_MPIDense        *contents;
423   VecScatter             ctx   = aij->Mvctx;
424   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
425   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
426   PetscInt               m     = A->rmap->n,n=B->cmap->n;
427 
428   PetscFunctionBegin;
429   ierr = MatCreate(PetscObjectComm((PetscObject)B),C);CHKERRQ(ierr);
430   ierr = MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);CHKERRQ(ierr);
431   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
432   ierr = MatSetType(*C,MATMPIDENSE);CHKERRQ(ierr);
433   ierr = MatMPIDenseSetPreallocation(*C,NULL);CHKERRQ(ierr);
434   ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
435   ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
436 
437   (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
438 
439   ierr = PetscNew(&contents);CHKERRQ(ierr);
440   /* Create work matrix used to store off processor rows of B needed for local product */
441   ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
442   /* Create work arrays needed */
443   ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
444                       B->cmap->N*to->starts[to->n],&contents->svalues,
445                       from->n,&contents->rwaits,
446                       to->n,&contents->swaits);CHKERRQ(ierr);
447 
448   ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr);
449   ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
450   ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
451   ierr = PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);CHKERRQ(ierr);
452   ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
453   PetscFunctionReturn(0);
454 }
455 
456 /*
457     Performs an efficient scatter on the rows of B needed by this process; this is
458     a modification of the VecScatterBegin_() routines.
459 */
460 PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
461 {
462   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*)A->data;
463   PetscErrorCode         ierr;
464   PetscScalar            *b,*w,*svalues,*rvalues;
465   VecScatter             ctx   = aij->Mvctx;
466   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
467   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
468   PetscInt               i,j,k;
469   PetscInt               *sindices,*sstarts,*rindices,*rstarts;
470   PetscMPIInt            *sprocs,*rprocs,nrecvs;
471   MPI_Request            *swaits,*rwaits;
472   MPI_Comm               comm;
473   PetscMPIInt            tag  = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
474   MPI_Status             status;
475   MPIAIJ_MPIDense        *contents;
476   PetscContainer         container;
477   Mat                    workB;
478 
479   PetscFunctionBegin;
480   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
481   ierr = PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);CHKERRQ(ierr);
482   if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
483   ierr = PetscContainerGetPointer(container,(void**)&contents);CHKERRQ(ierr);
484 
485   workB = *outworkB = contents->workB;
486   if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",workB->rmap->n,nrows);
487   sindices = to->indices;
488   sstarts  = to->starts;
489   sprocs   = to->procs;
490   swaits   = contents->swaits;
491   svalues  = contents->svalues;
492 
493   rindices = from->indices;
494   rstarts  = from->starts;
495   rprocs   = from->procs;
496   rwaits   = contents->rwaits;
497   rvalues  = contents->rvalues;
498 
499   ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr);
500   ierr = MatDenseGetArray(workB,&w);CHKERRQ(ierr);
501 
502   for (i=0; i<from->n; i++) {
503     ierr = MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);CHKERRQ(ierr);
504   }
505 
506   for (i=0; i<to->n; i++) {
507     /* pack a message at a time */
508     for (j=0; j<sstarts[i+1]-sstarts[i]; j++) {
509       for (k=0; k<ncols; k++) {
510         svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
511       }
512     }
513     ierr = MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);CHKERRQ(ierr);
514   }
515 
516   nrecvs = from->n;
517   while (nrecvs) {
518     ierr = MPI_Waitany(from->n,rwaits,&imdex,&status);CHKERRQ(ierr);
519     nrecvs--;
520     /* unpack a message at a time */
521     for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++) {
522       for (k=0; k<ncols; k++) {
523         w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
524       }
525     }
526   }
527   if (to->n) {ierr = MPI_Waitall(to->n,swaits,to->sstatus);CHKERRQ(ierr);}
528 
529   ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr);
530   ierr = MatDenseRestoreArray(workB,&w);CHKERRQ(ierr);
531   ierr = MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
532   ierr = MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
533   PetscFunctionReturn(0);
534 }
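/*
   The pack/unpack indexing above relies on the column-major storage of the local
   dense arrays: entry (row r, column k) of B lives at b[r + nrowsB*k] and the
   matching entry of workB at w[r + nrows*k].  Each message therefore carries
   ncols = B->cmap->N scalars per scattered row, packed row-block by row-block
   into svalues/rvalues.
*/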
535 extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);
536 
537 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
538 {
539   PetscErrorCode ierr;
540   Mat_MPIAIJ     *aij    = (Mat_MPIAIJ*)A->data;
541   Mat_MPIDense   *bdense = (Mat_MPIDense*)B->data;
542   Mat_MPIDense   *cdense = (Mat_MPIDense*)C->data;
543   Mat            workB;
544 
545   PetscFunctionBegin;
546   /* diagonal block of A times all local rows of B */
547   ierr = MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);CHKERRQ(ierr);
548 
549   /* get off processor parts of B needed to complete the product */
550   ierr = MatMPIDenseScatter(A,B,C,&workB);CHKERRQ(ierr);
551 
552   /* off-diagonal block of A times nonlocal rows of B */
553   ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);CHKERRQ(ierr);
554   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
555   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
556   PetscFunctionReturn(0);
557 }
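/*
   In matrix form, the routine above computes on each process

     C_loc = A_diag * B_loc + A_off * workB

   where workB holds the off-process rows of B corresponding to the nonzero
   columns of A_off, gathered by MatMPIDenseScatter().
*/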
558 
559 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
560 {
561   PetscErrorCode ierr;
562   Mat_MPIAIJ     *a   = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
563   Mat_SeqAIJ     *ad  = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
564   Mat_SeqAIJ     *cd  = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
565   PetscInt       *adi = ad->i,*adj,*aoi=ao->i,*aoj;
566   PetscScalar    *ada,*aoa,*cda=cd->a,*coa=co->a;
567   Mat_SeqAIJ     *p_loc,*p_oth;
568   PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
569   PetscScalar    *pa_loc,*pa_oth,*pa,valtmp,*ca;
570   PetscInt       cm          = C->rmap->n,anz,pnz;
571   Mat_PtAPMPI    *ptap       = c->ptap;
572   PetscScalar    *apa_sparse = ptap->apa;
573   PetscInt       *api,*apj,*apJ,i,j,k,row;
574   PetscInt       cstart = C->cmap->rstart;
575   PetscInt       cdnz,conz,k0,k1,nextp;
576   MPI_Comm       comm;
577   PetscMPIInt    size;
578 
579   PetscFunctionBegin;
580   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
581   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
582 
583   /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
584   /*-----------------------------------------------------*/
585   /* update numerical values of P_oth and P_loc */
586   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
587   ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
588 
589   /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
590   /*----------------------------------------------------------*/
591   /* get data from symbolic products */
592   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
593   pi_loc = p_loc->i; pj_loc = p_loc->j; pa_loc = p_loc->a;
594   if (size >1) {
595     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
596     pi_oth = p_oth->i; pj_oth = p_oth->j; pa_oth = p_oth->a;
597   } else {
598     p_oth = NULL; pi_oth = NULL; pj_oth = NULL; pa_oth = NULL;
599   }
600 
601   api = ptap->api;
602   apj = ptap->apj;
603   for (i=0; i<cm; i++) {
604     apJ = apj + api[i];
605 
606     /* diagonal portion of A */
607     anz = adi[i+1] - adi[i];
608     adj = ad->j + adi[i];
609     ada = ad->a + adi[i];
610     for (j=0; j<anz; j++) {
611       row = adj[j];
612       pnz = pi_loc[row+1] - pi_loc[row];
613       pj  = pj_loc + pi_loc[row];
614       pa  = pa_loc + pi_loc[row];
615       /* perform sparse axpy */
616       valtmp = ada[j];
617       nextp  = 0;
618       for (k=0; nextp<pnz; k++) {
619         if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
620           apa_sparse[k] += valtmp*pa[nextp++];
621         }
622       }
623       ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
624     }
625 
626     /* off-diagonal portion of A */
627     anz = aoi[i+1] - aoi[i];
628     aoj = ao->j + aoi[i];
629     aoa = ao->a + aoi[i];
630     for (j=0; j<anz; j++) {
631       row = aoj[j];
632       pnz = pi_oth[row+1] - pi_oth[row];
633       pj  = pj_oth + pi_oth[row];
634       pa  = pa_oth + pi_oth[row];
635       /* perform sparse axpy */
636       valtmp = aoa[j];
637       nextp  = 0;
638       for (k=0; nextp<pnz; k++) {
639         if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
640           apa_sparse[k] += valtmp*pa[nextp++];
641         }
642       }
643       ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
644     }
645 
646     /* set values in C */
647     cdnz = cd->i[i+1] - cd->i[i];
648     conz = co->i[i+1] - co->i[i];
649 
650     /* 1st off-diagonal part of C */
651     ca = coa + co->i[i];
652     k  = 0;
653     for (k0=0; k0<conz; k0++) {
654       if (apJ[k] >= cstart) break;
655       ca[k0]        = apa_sparse[k];
656       apa_sparse[k] = 0.0;
657       k++;
658     }
659 
660     /* diagonal part of C */
661     ca = cda + cd->i[i];
662     for (k1=0; k1<cdnz; k1++) {
663       ca[k1]        = apa_sparse[k];
664       apa_sparse[k] = 0.0;
665       k++;
666     }
667 
668     /* 2nd off-diagonal part of C */
669     ca = coa + co->i[i];
670     for (; k0<conz; k0++) {
671       ca[k0]        = apa_sparse[k];
672       apa_sparse[k] = 0.0;
673       k++;
674     }
675   }
676   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
677   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
678   PetscFunctionReturn(0);
679 }
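/*
   The "sparse axpy" loops above merge two sorted index lists: apJ (columns of
   row i of AP from the symbolic phase) and pj (columns of one row of P, a
   subset of apJ).  The scan

     for (k=0; nextp<pnz; k++) if (apJ[k] == pj[nextp]) apa_sparse[k] += valtmp*pa[nextp++];

   advances through apJ once per row of P; e.g. with apJ = {1,4,7,9} and
   pj = {4,9} (hypothetical values) only positions k = 1 and k = 3 are updated.
*/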
680 
681 /* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(), except it uses the scalable LLCondensed variant to avoid the O(P->cmap->N) dense work array and bit table */
682 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
683 {
684   PetscErrorCode     ierr;
685   MPI_Comm           comm;
686   PetscMPIInt        size;
687   Mat                Cmpi;
688   Mat_PtAPMPI        *ptap;
689   PetscFreeSpaceList free_space = NULL,current_space=NULL;
690   Mat_MPIAIJ         *a         = (Mat_MPIAIJ*)A->data,*c;
691   Mat_SeqAIJ         *ad        = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
692   PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
693   PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
694   PetscInt           i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max;
695   PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
696   PetscReal          afill;
697   PetscScalar        *apa;
698   PetscTable         ta;
699 
700   PetscFunctionBegin;
701   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
702   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
703 
704   /* create struct Mat_PtAPMPI and attach it to C later */
705   ierr = PetscNew(&ptap);CHKERRQ(ierr);
706 
707   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
708   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
709 
710   /* get P_loc by taking all local rows of P */
711   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
712 
713   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
714   pi_loc = p_loc->i; pj_loc = p_loc->j;
715   if (size > 1) {
716     p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
717     pi_oth = p_oth->i; pj_oth = p_oth->j;
718   } else {
719     p_oth  = NULL;
720     pi_oth = NULL; pj_oth = NULL;
721   }
722 
723   /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
724   /*-------------------------------------------------------------------*/
725   ierr      = PetscMalloc1(am+2,&api);CHKERRQ(ierr);
726   ptap->api = api;
727   api[0]    = 0;
728 
729   /* create and initialize a linked list */
730   apnz_max = 6*(p_loc->rmax + (PetscInt)(1.e-2*pN)); /* expected apnz_max */
731   if (apnz_max > pN) apnz_max = pN;
732   ierr = PetscTableCreate(apnz_max,pN,&ta);CHKERRQ(ierr);
733 
734   /* Calculate apnz_max */
735   apnz_max = 0;
736   for (i=0; i<am; i++) {
737     ierr = PetscTableRemoveAll(ta);CHKERRQ(ierr);
738     /* diagonal portion of A */
739     nzi  = adi[i+1] - adi[i];
740     Jptr = adj+adi[i];  /* cols of A_diag */
741     MatMergeRows_SeqAIJ(p_loc,nzi,Jptr,ta);
742     ierr = PetscTableGetCount(ta,&apnz);CHKERRQ(ierr);
743     if (apnz_max < apnz) apnz_max = apnz;
744 
745     /*  off-diagonal portion of A */
746     nzi = aoi[i+1] - aoi[i];
747     Jptr = aoj+aoi[i];  /* cols of A_off */
748     MatMergeRows_SeqAIJ(p_oth,nzi,Jptr,ta);
749     ierr = PetscTableGetCount(ta,&apnz);CHKERRQ(ierr);
750     if (apnz_max < apnz) apnz_max = apnz;
751   }
752   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
753 
754   ierr = PetscLLCondensedCreate_Scalable(apnz_max,&lnk);CHKERRQ(ierr);
755 
756   /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
757   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);CHKERRQ(ierr);
758   current_space = free_space;
759   ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
760   for (i=0; i<am; i++) {
761     /* diagonal portion of A */
762     nzi = adi[i+1] - adi[i];
763     for (j=0; j<nzi; j++) {
764       row  = *adj++;
765       pnz  = pi_loc[row+1] - pi_loc[row];
766       Jptr = pj_loc + pi_loc[row];
767       /* add non-zero cols of P into the sorted linked list lnk */
768       ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
769     }
770     /* off-diagonal portion of A */
771     nzi = aoi[i+1] - aoi[i];
772     for (j=0; j<nzi; j++) {
773       row  = *aoj++;
774       pnz  = pi_oth[row+1] - pi_oth[row];
775       Jptr = pj_oth + pi_oth[row];
776       ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
777     }
778 
779     apnz     = *lnk;
780     api[i+1] = api[i] + apnz;
781 
782     /* if free space is not available, double the total space in the list */
783     if (current_space->local_remaining<apnz) {
784       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
785       nspacedouble++;
786     }
787 
788     /* Copy data into free space, then initialize lnk */
789     ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);
790     ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);
791 
792     current_space->array           += apnz;
793     current_space->local_used      += apnz;
794     current_space->local_remaining -= apnz;
795   }
796 
797   /* Allocate space for apj, initialize apj, and */
798   /* destroy list of free space and other temporary array(s) */
799   ierr = PetscMalloc1(api[am]+1,&ptap->apj);CHKERRQ(ierr);
800   apj  = ptap->apj;
801   ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
802   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
803 
804   /* create and assemble symbolic parallel matrix Cmpi */
805   /*----------------------------------------------------*/
806   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
807   ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
808   ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);
809   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
810   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
811   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
812 
813   /* malloc apa for assembly Cmpi */
814   ierr = PetscCalloc1(apnz_max,&apa);CHKERRQ(ierr);
815 
816   ptap->apa = apa;
817   for (i=0; i<am; i++) {
818     row  = i + rstart;
819     apnz = api[i+1] - api[i];
820     ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
821     apj += apnz;
822   }
823   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
824   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
825 
826   ptap->destroy             = Cmpi->ops->destroy;
827   ptap->duplicate           = Cmpi->ops->duplicate;
828   Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ;
829   Cmpi->ops->destroy        = MatDestroy_MPIAIJ_MatMatMult;
830   Cmpi->ops->duplicate      = MatDuplicate_MPIAIJ_MatMatMult;
831 
832   /* attach the supporting struct to Cmpi for reuse */
833   c       = (Mat_MPIAIJ*)Cmpi->data;
834   c->ptap = ptap;
835 
836   *C = Cmpi;
837 
838   /* set MatInfo */
839   afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
840   if (afill < 1.0) afill = 1.0;
841   Cmpi->info.mallocs           = nspacedouble;
842   Cmpi->info.fill_ratio_given  = fill;
843   Cmpi->info.fill_ratio_needed = afill;
844 
845 #if defined(PETSC_USE_INFO)
846   if (api[am]) {
847     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
848     ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
849   } else {
850     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
851   }
852 #endif
853   PetscFunctionReturn(0);
854 }
855 
856 /*-------------------------------------------------------------------------*/
857 PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
858 {
859   PetscErrorCode ierr;
860   const char     *algTypes[3] = {"scalable","nonscalable","matmatmult"};
861   PetscInt       alg=0; /* set default algorithm */
862 
863   PetscFunctionBegin;
864   if (scall == MAT_INITIAL_MATRIX) {
865     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
866     ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,3,algTypes[0],&alg,NULL);CHKERRQ(ierr);
867     ierr = PetscOptionsEnd();CHKERRQ(ierr);
868 
869     ierr = PetscLogEventBegin(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
870     switch (alg) {
871     case 1:
872       ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(P,A,fill,C);CHKERRQ(ierr);
873       break;
874     case 2:
875     {
876       Mat         Pt;
877       Mat_PtAPMPI *ptap;
878       Mat_MPIAIJ  *c;
879       ierr = MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);CHKERRQ(ierr);
880       ierr = MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr);
881       c        = (Mat_MPIAIJ*)(*C)->data;
882       ptap     = c->ptap;
883       ptap->Pt = Pt;
884       (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult;
885       PetscFunctionReturn(0);
886     }
887       break;
888     default:
889       ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);CHKERRQ(ierr);
890       break;
891     }
892     ierr = PetscLogEventEnd(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
893   }
894   ierr = PetscLogEventBegin(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
895   ierr = (*(*C)->ops->mattransposemultnumeric)(P,A,*C);CHKERRQ(ierr);
896   ierr = PetscLogEventEnd(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
897   PetscFunctionReturn(0);
898 }
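/*
   Algorithm selection here mirrors MatMatMult_MPIAIJ_MPIAIJ(); hedged example
   invocations:

     ./myapp -mattransposematmult_via scalable      (default)
     ./myapp -mattransposematmult_via nonscalable
     ./myapp -mattransposematmult_via matmatmult    (forms Pt = P^T explicitly, then Pt*A)
*/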
899 
900 /* This routine only works when scall=MAT_REUSE_MATRIX! */
901 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult(Mat P,Mat A,Mat C)
902 {
903   PetscErrorCode ierr;
904   Mat_MPIAIJ     *c=(Mat_MPIAIJ*)C->data;
905   Mat_PtAPMPI    *ptap= c->ptap;
906   Mat            Pt=ptap->Pt;
907 
908   PetscFunctionBegin;
909   ierr = MatTranspose(P,MAT_REUSE_MATRIX,&Pt);CHKERRQ(ierr);
910   ierr = MatMatMultNumeric(Pt,A,C);CHKERRQ(ierr);
911   PetscFunctionReturn(0);
912 }
913 
914 /* Non-scalable version, use dense axpy */
915 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,Mat C)
916 {
917   PetscErrorCode      ierr;
918   Mat_Merge_SeqsToMPI *merge;
919   Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
920   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
921   Mat_PtAPMPI         *ptap;
922   PetscInt            *adj,*aJ;
923   PetscInt            i,j,k,anz,pnz,row,*cj;
924   MatScalar           *ada,*aval,*ca,valtmp;
925   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
926   MPI_Comm            comm;
927   PetscMPIInt         size,rank,taga,*len_s;
928   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
929   PetscInt            **buf_ri,**buf_rj;
930   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
931   MPI_Request         *s_waits,*r_waits;
932   MPI_Status          *status;
933   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
934   PetscInt            *ai,*aj,*coi,*coj;
935   PetscInt            *poJ,*pdJ;
936   Mat                 A_loc;
937   Mat_SeqAIJ          *a_loc;
938 
939   PetscFunctionBegin;
940   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
941   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
942   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
943 
944   ptap  = c->ptap;
945   merge = ptap->merge;
946 
947   /* 2) compute numeric C_seq = P_loc^T*A_loc - dominating part */
948   /*--------------------------------------------------------------*/
949   /* get data from symbolic products */
950   coi  = merge->coi; coj = merge->coj;
951   ierr = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);
952 
953   bi     = merge->bi; bj = merge->bj;
954   owners = merge->rowmap->range;
955   ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);
956 
957   /* get A_loc by taking all local rows of A */
958   A_loc = ptap->A_loc;
959   ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
960   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
961   ai    = a_loc->i;
962   aj    = a_loc->j;
963 
964   ierr = PetscCalloc1(A->cmap->N,&aval);CHKERRQ(ierr); /* non-scalable!!! */
965 
966   for (i=0; i<am; i++) {
967     /* 2-a) put A[i,:] to dense array aval */
968     anz = ai[i+1] - ai[i];
969     adj = aj + ai[i];
970     ada = a_loc->a + ai[i];
971     for (j=0; j<anz; j++) {
972       aval[adj[j]] = ada[j];
973     }
974 
975     /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
976     /*--------------------------------------------------------------*/
977     /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
978     pnz = po->i[i+1] - po->i[i];
979     poJ = po->j + po->i[i];
980     pA  = po->a + po->i[i];
981     for (j=0; j<pnz; j++) {
982       row = poJ[j];
983       cnz = coi[row+1] - coi[row];
984       cj  = coj + coi[row];
985       ca  = coa + coi[row];
986       /* perform dense axpy */
987       valtmp = pA[j];
988       for (k=0; k<cnz; k++) {
989         ca[k] += valtmp*aval[cj[k]];
990       }
991       ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
992     }
993 
994     /* put the value into Cd (diagonal part) */
995     pnz = pd->i[i+1] - pd->i[i];
996     pdJ = pd->j + pd->i[i];
997     pA  = pd->a + pd->i[i];
998     for (j=0; j<pnz; j++) {
999       row = pdJ[j];
1000       cnz = bi[row+1] - bi[row];
1001       cj  = bj + bi[row];
1002       ca  = ba + bi[row];
1003       /* perform dense axpy */
1004       valtmp = pA[j];
1005       for (k=0; k<cnz; k++) {
1006         ca[k] += valtmp*aval[cj[k]];
1007       }
1008       ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1009     }
1010 
1011     /* zero the current row of Pt*A */
1012     aJ = aj + ai[i];
1013     for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
1014   }
1015 
1016   /* 3) send and recv matrix values coa */
1017   /*------------------------------------*/
1018   buf_ri = merge->buf_ri;
1019   buf_rj = merge->buf_rj;
1020   len_s  = merge->len_s;
1021   ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
1022   ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1023 
1024   ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
1025   for (proc=0,k=0; proc<size; proc++) {
1026     if (!len_s[proc]) continue;
1027     i    = merge->owners_co[proc];
1028     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1029     k++;
1030   }
1031   if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
1032   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
1033 
1034   ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
1035   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1036   ierr = PetscFree(coa);CHKERRQ(ierr);
1037 
1038   /* 4) insert local Cseq and received values into Cmpi */
1039   /*----------------------------------------------------*/
1040   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1041   for (k=0; k<merge->nrecv; k++) {
1042     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1043     nrows       = *(buf_ri_k[k]);
1044     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
1045     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1046   }
1047 
1048   for (i=0; i<cm; i++) {
1049     row  = owners[rank] + i; /* global row index of C_seq */
1050     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1051     ba_i = ba + bi[i];
1052     bnz  = bi[i+1] - bi[i];
1053     /* add received vals into ba */
1054     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1055       /* i-th row */
1056       if (i == *nextrow[k]) {
1057         cnz    = *(nextci[k]+1) - *nextci[k];
1058         cj     = buf_rj[k] + *(nextci[k]);
1059         ca     = abuf_r[k] + *(nextci[k]);
1060         nextcj = 0;
1061         for (j=0; nextcj<cnz; j++) {
1062           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1063             ba_i[j] += ca[nextcj++];
1064           }
1065         }
1066         nextrow[k]++; nextci[k]++;
1067         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1068       }
1069     }
1070     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
1071   }
1072   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1073   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1074 
1075   ierr = PetscFree(ba);CHKERRQ(ierr);
1076   ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
1077   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1078   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1079   ierr = PetscFree(aval);CHKERRQ(ierr);
1080   PetscFunctionReturn(0);
1081 }
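/*
   The accumulation above uses the outer-product form

     C = P^T * A = sum_i P[i,:]^T * A[i,:]

   Each local row i of A is expanded into the dense work array aval (length
   A->cmap->N, hence the "non-scalable" label), and every nonzero P(i,j) adds
   valtmp*aval[] into row j of either Co (rows owned by other processes, shipped
   in step 3) or Cd/ba (locally owned rows).
*/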
1082 
1083 PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1084 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1085 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,PetscReal fill,Mat *C)
1086 {
1087   PetscErrorCode      ierr;
1088   Mat                 Cmpi,A_loc,POt,PDt;
1089   Mat_PtAPMPI         *ptap;
1090   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1091   Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
1092   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1093   PetscInt            nnz;
1094   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1095   PetscInt            am=A->rmap->n,pn=P->cmap->n;
1096   PetscBT             lnkbt;
1097   MPI_Comm            comm;
1098   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1099   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1100   PetscInt            len,proc,*dnz,*onz,*owners;
1101   PetscInt            nzi,*bi,*bj;
1102   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1103   MPI_Request         *swaits,*rwaits;
1104   MPI_Status          *sstatus,rstatus;
1105   Mat_Merge_SeqsToMPI *merge;
1106   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1107   PetscReal           afill  =1.0,afill_tmp;
1108   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N;
1109   PetscScalar         *vals;
1110   Mat_SeqAIJ          *a_loc, *pdt,*pot;
1111 
1112   PetscFunctionBegin;
1113   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1114   /* check if matrix local sizes are compatible */
1115   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D,%D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1116 
1117   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1118   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1119 
1120   /* create struct Mat_PtAPMPI and attach it to C later */
1121   ierr = PetscNew(&ptap);CHKERRQ(ierr);
1122 
1123   /* get A_loc by taking all local rows of A */
1124   ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
1125 
1126   ptap->A_loc = A_loc;
1127 
1128   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1129   ai    = a_loc->i;
1130   aj    = a_loc->j;
1131 
1132   /* determine symbolic Co=(p->B)^T*A - send to others */
1133   /*----------------------------------------------------*/
1134   ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
1135   pdt  = (Mat_SeqAIJ*)PDt->data;
1136   pdti = pdt->i; pdtj = pdt->j;
1137 
1138   ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
1139   pot  = (Mat_SeqAIJ*)POt->data;
1140   poti = pot->i; potj = pot->j;
1141 
1142   /* then, compute symbolic Co = (p->B)^T*A */
1143   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */
1144   ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
1145   coi[0] = 0;
1146 
1147   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1148   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],ai[am]));
1149   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1150   current_space = free_space;
1151 
1152   /* create and initialize a linked list */
1153   ierr = PetscLLCondensedCreate(aN,aN,&lnk,&lnkbt);CHKERRQ(ierr);
1154 
1155   for (i=0; i<pon; i++) {
1156     pnz = poti[i+1] - poti[i];
1157     ptJ = potj + poti[i];
1158     for (j=0; j<pnz; j++) {
1159       row  = ptJ[j]; /* row of A_loc == col of POt */
1160       anz  = ai[row+1] - ai[row];
1161       Jptr = aj + ai[row];
1162       /* add non-zero cols of AP into the sorted linked list lnk */
1163       ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1164     }
1165     nnz = lnk[0];
1166 
1167     /* If free space is not available, double the total space in the list */
1168     if (current_space->local_remaining<nnz) {
1169       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1170       nspacedouble++;
1171     }
1172 
1173     /* Copy data into free space, then initialize lnk */
1174     ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1175 
1176     current_space->array           += nnz;
1177     current_space->local_used      += nnz;
1178     current_space->local_remaining -= nnz;
1179 
1180     coi[i+1] = coi[i] + nnz;
1181   }
1182 
1183   ierr = PetscMalloc1(coi[pon]+1,&coj);CHKERRQ(ierr);
1184   ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
1185 
1186   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1187   if (afill_tmp > afill) afill = afill_tmp;
1188 
1189   /* send j-array (coj) of Co to other processors */
1190   /*----------------------------------------------*/
1191   /* determine row ownership */
1192   ierr = PetscNew(&merge);CHKERRQ(ierr);
1193   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1194 
1195   merge->rowmap->n  = pn;
1196   merge->rowmap->bs = 1;
1197 
1198   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1199   owners = merge->rowmap->range;
1200 
1201   /* determine the number of messages to send, their lengths */
1202   ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
1203   ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);
1204 
1205   len_s        = merge->len_s;
1206   merge->nsend = 0;
1207 
1208   ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
1209   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1210 
1211   proc = 0;
1212   for (i=0; i<pon; i++) {
1213     while (prmap[i] >= owners[proc+1]) proc++;
1214     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1215     len_s[proc] += coi[i+1] - coi[i];
1216   }
1217 
1218   len          = 0; /* total length of buf_s[] */
1219   owners_co[0] = 0;
1220   for (proc=0; proc<size; proc++) {
1221     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1222     if (len_si[proc]) {
1223       merge->nsend++;
1224       len_si[proc] = 2*(len_si[proc] + 1);
1225       len         += len_si[proc];
1226     }
1227   }
1228 
1229   /* determine the number and length of messages to receive for coi and coj  */
1230   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1231   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1232 
1233   /* post the Irecv and Isend of coj */
1234   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1235   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1236   ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
1237   for (proc=0, k=0; proc<size; proc++) {
1238     if (!len_s[proc]) continue;
1239     i    = owners_co[proc];
1240     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1241     k++;
1242   }
1243 
1244   /* receives and sends of coj are complete */
1245   ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
1246   for (i=0; i<merge->nrecv; i++) {
1247     PetscMPIInt icompleted;
1248     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1249   }
1250   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1251   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1252 
1253   /* send and recv coi */
1254   /*-------------------*/
1255   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1256   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1257   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
1258   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1259   for (proc=0,k=0; proc<size; proc++) {
1260     if (!len_s[proc]) continue;
1261     /* form outgoing message for i-structure:
1262          buf_si[0]:                 nrows to be sent
1263                [1:nrows]:           row index (global)
1264                [nrows+1:2*nrows+1]: i-structure index
1265     */
1266     /*-------------------------------------------*/
1267     nrows       = len_si[proc]/2 - 1;
1268     buf_si_i    = buf_si + nrows+1;
1269     buf_si[0]   = nrows;
1270     buf_si_i[0] = 0;
1271     nrows       = 0;
1272     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1273       nzi               = coi[i+1] - coi[i];
1274       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1275       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1276       nrows++;
1277     }
1278     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1279     k++;
1280     buf_si += len_si[proc];
1281   }
1282   i = merge->nrecv;
1283   while (i--) {
1284     PetscMPIInt icompleted;
1285     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1286   }
1287   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1288   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1289   ierr = PetscFree(len_si);CHKERRQ(ierr);
1290   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1291   ierr = PetscFree(swaits);CHKERRQ(ierr);
1292   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1293   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1294 
1295   /* compute the local portion of C (mpi mat) */
1296   /*------------------------------------------*/
1297   /* allocate bi array and free space for accumulating nonzero column info */
1298   ierr  = PetscMalloc1(pn+1,&bi);CHKERRQ(ierr);
1299   bi[0] = 0;
1300 
1301   /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1302   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pdti[pn],PetscIntSumTruncate(poti[pon],ai[am])));
1303   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1304   current_space = free_space;
1305 
1306   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1307   for (k=0; k<merge->nrecv; k++) {
1308     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1309     nrows       = *buf_ri_k[k];
1310     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
1311     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1312   }
1313 
1314   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1315   rmax = 0;
1316   for (i=0; i<pn; i++) {
1317     /* add pdt[i,:]*AP into lnk */
1318     pnz = pdti[i+1] - pdti[i];
1319     ptJ = pdtj + pdti[i];
1320     for (j=0; j<pnz; j++) {
1321       row  = ptJ[j];  /* row of A_loc == col of Pt */
1322       anz  = ai[row+1] - ai[row];
1323       Jptr = aj + ai[row];
1324       /* add non-zero cols of AP into the sorted linked list lnk */
1325       ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1326     }
1327 
1328     /* add received col data into lnk */
1329     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1330       if (i == *nextrow[k]) { /* i-th row */
1331         nzi  = *(nextci[k]+1) - *nextci[k];
1332         Jptr = buf_rj[k] + *nextci[k];
1333         ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1334         nextrow[k]++; nextci[k]++;
1335       }
1336     }
1337     nnz = lnk[0];
1338 
1339     /* if free space is not available, make more free space */
1340     if (current_space->local_remaining<nnz) {
1341       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1342       nspacedouble++;
1343     }
1344     /* copy data into free space, then initialize lnk */
1345     ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1346     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1347 
1348     current_space->array           += nnz;
1349     current_space->local_used      += nnz;
1350     current_space->local_remaining -= nnz;
1351 
1352     bi[i+1] = bi[i] + nnz;
1353     if (nnz > rmax) rmax = nnz;
1354   }
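  /* bi[] now holds the row offsets of the local rows of C and rmax the longest row length;
     rmax sizes the zero-valued vals[] buffer used below to insert the symbolic pattern into Cmpi */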
1355   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1356 
1357   ierr = PetscMalloc1(bi[pn]+1,&bj);CHKERRQ(ierr);
1358   ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1359 
1360   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1361   if (afill_tmp > afill) afill = afill_tmp;
1362   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
1363   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1364   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1365 
1366   /* create symbolic parallel matrix Cmpi - why can it not be assembled in the Numeric part? */
1367   /*----------------------------------------------------------------------------------*/
1368   ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);
1369 
1370   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1371   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1372   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
1373   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1374   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1375   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1376   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1377   for (i=0; i<pn; i++) {
1378     row  = i + rstart;
1379     nnz  = bi[i+1] - bi[i];
1380     Jptr = bj + bi[i];
1381     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1382   }
1383   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1384   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1385   ierr = PetscFree(vals);CHKERRQ(ierr);
1386 
1387   merge->bi        = bi;
1388   merge->bj        = bj;
1389   merge->coi       = coi;
1390   merge->coj       = coj;
1391   merge->buf_ri    = buf_ri;
1392   merge->buf_rj    = buf_rj;
1393   merge->owners_co = owners_co;
1394 
1395   /* attach the supporting struct to Cmpi for reuse */
1396   c           = (Mat_MPIAIJ*)Cmpi->data;
1397   c->ptap     = ptap;
1398   ptap->api   = NULL;
1399   ptap->apj   = NULL;
1400   ptap->merge = merge;
1401   ptap->destroy   = Cmpi->ops->destroy;
1402   ptap->duplicate = Cmpi->ops->duplicate;
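  /* keep the original destroy/duplicate methods in ptap so that the PtAP-aware versions
     installed below can still reach them when the attached data is released */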
1403 
1404   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
1405   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1406   Cmpi->ops->duplicate               = MatDuplicate_MPIAIJ_MatPtAP;
1407 
1408   *C = Cmpi;
1409 #if defined(PETSC_USE_INFO)
1410   if (bi[pn] != 0) {
1411     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
1412     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
1413   } else {
1414     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1415   }
1416 #endif
1417   PetscFunctionReturn(0);
1418 }
1419 
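/* Numeric phase of C = P^T*A for MPIAIJ matrices: reuses the merge/ptap data built by the
   matching symbolic routine, computes the local products P_loc^T*A_loc, exchanges the
   off-process contributions, and assembles the parallel matrix C. */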
1420 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
1421 {
1422   PetscErrorCode      ierr;
1423   Mat_Merge_SeqsToMPI *merge;
1424   Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1425   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1426   Mat_PtAPMPI         *ptap;
1427   PetscInt            *adj;
1428   PetscInt            i,j,k,anz,pnz,row,*cj,nexta;
1429   MatScalar           *ada,*ca,valtmp;
1430   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1431   MPI_Comm            comm;
1432   PetscMPIInt         size,rank,taga,*len_s;
1433   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1434   PetscInt            **buf_ri,**buf_rj;
1435   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local arrays of C (mpi mat) */
1436   MPI_Request         *s_waits,*r_waits;
1437   MPI_Status          *status;
1438   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
1439   PetscInt            *ai,*aj,*coi,*coj;
1440   PetscInt            *poJ,*pdJ;
1441   Mat                 A_loc;
1442   Mat_SeqAIJ          *a_loc;
1443 
1444   PetscFunctionBegin;
1445   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1446   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1447   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1448 
1449   ptap  = c->ptap;
1450   merge = ptap->merge;
1451 
1452   /* 2) compute numeric C_seq = P_loc^T*A_loc */
1453   /*------------------------------------------*/
1454   /* get data from symbolic products */
1455   coi    = merge->coi; coj = merge->coj;
1456   ierr   = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);
1457   bi     = merge->bi; bj = merge->bj;
1458   owners = merge->rowmap->range;
1459   ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);
1460 
1461   /* get A_loc by taking all local rows of A */
1462   A_loc = ptap->A_loc;
1463   ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
1464   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1465   ai    = a_loc->i;
1466   aj    = a_loc->j;
1467 
1468   for (i=0; i<am; i++) {
1469     anz = ai[i+1] - ai[i];
1470     adj = aj + ai[i];
1471     ada = a_loc->a + ai[i];
1472 
1473     /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1474     /*-------------------------------------------------------------*/
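    /* For each local row i of A, row i of P contributes an outer product P[i,:]^T * A[i,:].
       Because the symbolic phase already fixed the column pattern of each target row (cj below),
       the sparse axpy loops only advance through adj[] and add valtmp*ada[nexta] wherever the
       patterns match. */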
1475     /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1476     pnz = po->i[i+1] - po->i[i];
1477     poJ = po->j + po->i[i];
1478     pA  = po->a + po->i[i];
1479     for (j=0; j<pnz; j++) {
1480       row = poJ[j];
1481       cj  = coj + coi[row];
1482       ca  = coa + coi[row];
1483       /* perform sparse axpy */
1484       nexta  = 0;
1485       valtmp = pA[j];
1486       for (k=0; nexta<anz; k++) {
1487         if (cj[k] == adj[nexta]) {
1488           ca[k] += valtmp*ada[nexta];
1489           nexta++;
1490         }
1491       }
1492       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1493     }
1494 
1495     /* put the value into Cd (diagonal part) */
1496     pnz = pd->i[i+1] - pd->i[i];
1497     pdJ = pd->j + pd->i[i];
1498     pA  = pd->a + pd->i[i];
1499     for (j=0; j<pnz; j++) {
1500       row = pdJ[j];
1501       cj  = bj + bi[row];
1502       ca  = ba + bi[row];
1503       /* perform sparse axpy */
1504       nexta  = 0;
1505       valtmp = pA[j];
1506       for (k=0; nexta<anz; k++) {
1507         if (cj[k] == adj[nexta]) {
1508           ca[k] += valtmp*ada[nexta];
1509           nexta++;
1510         }
1511       }
1512       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1513     }
1514   }
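  /* coa now holds the contributions to rows of C owned by other processes (computed from the
     off-diagonal part of P and sent below), while ba holds the values of the locally owned rows */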
1515 
1516   /* 3) send and recv matrix values coa */
1517   /*------------------------------------*/
1518   buf_ri = merge->buf_ri;
1519   buf_rj = merge->buf_rj;
1520   len_s  = merge->len_s;
1521   ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
1522   ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1523 
1524   ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
1525   for (proc=0,k=0; proc<size; proc++) {
1526     if (!len_s[proc]) continue;
1527     i    = merge->owners_co[proc];
1528     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1529     k++;
1530   }
1531   if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
1532   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
1533 
1534   ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
1535   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1536   ierr = PetscFree(coa);CHKERRQ(ierr);
1537 
1538   /* 4) insert local Cseq and received values into Cmpi */
1539   /*----------------------------------------------------*/
1540   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1541   for (k=0; k<merge->nrecv; k++) {
1542     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1543     nrows       = *(buf_ri_k[k]);
1544     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
1545     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1546   }
1547 
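  /* merge the received off-process contributions into ba row by row, then insert each
     completed row into the parallel matrix C */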
1548   for (i=0; i<cm; i++) {
1549     row  = owners[rank] + i; /* global row index of C_seq */
1550     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1551     ba_i = ba + bi[i];
1552     bnz  = bi[i+1] - bi[i];
1553     /* add received vals into ba */
1554     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1555       /* i-th row */
1556       if (i == *nextrow[k]) {
1557         cnz    = *(nextci[k]+1) - *nextci[k];
1558         cj     = buf_rj[k] + *(nextci[k]);
1559         ca     = abuf_r[k] + *(nextci[k]);
1560         nextcj = 0;
1561         for (j=0; nextcj<cnz; j++) {
1562           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1563             ba_i[j] += ca[nextcj++];
1564           }
1565         }
1566         nextrow[k]++; nextci[k]++;
1567         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1568       }
1569     }
1570     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
1571   }
1572   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1573   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1574 
1575   ierr = PetscFree(ba);CHKERRQ(ierr);
1576   ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
1577   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1578   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1579   PetscFunctionReturn(0);
1580 }
1581 
1582 PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
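/* A minimal usage sketch (caller-side code, an assumption about typical use, not part of this file):
   the user-facing entry point is MatTransposeMatMult(), which for a pair of MPIAIJ matrices reaches
   one of the MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ* routines (algorithm chosen at runtime) on the
   first call and the corresponding numeric routine on every call:

     Mat C;
     ierr = MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,fill,&C);CHKERRQ(ierr);   -- builds C = P^T*A
     ierr = MatTransposeMatMult(P,A,MAT_REUSE_MATRIX,fill,&C);CHKERRQ(ierr);     -- reuses the data attached to C
     ierr = MatDestroy(&C);CHKERRQ(ierr);
*/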
1583 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ();
1584    it differs from MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable in using PetscLLCondensedCreate_Scalable() */
1585 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
1586 {
1587   PetscErrorCode      ierr;
1588   Mat                 Cmpi,A_loc,POt,PDt;
1589   Mat_PtAPMPI         *ptap;
1590   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1591   Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
1592   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1593   PetscInt            nnz;
1594   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1595   PetscInt            am  =A->rmap->n,pn=P->cmap->n;
1596   MPI_Comm            comm;
1597   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1598   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1599   PetscInt            len,proc,*dnz,*onz,*owners;
1600   PetscInt            nzi,*bi,*bj;
1601   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1602   MPI_Request         *swaits,*rwaits;
1603   MPI_Status          *sstatus,rstatus;
1604   Mat_Merge_SeqsToMPI *merge;
1605   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1606   PetscReal           afill  =1.0,afill_tmp;
1607   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Armax;
1608   PetscScalar         *vals;
1609   Mat_SeqAIJ          *a_loc,*pdt,*pot;
1610   PetscTable          ta;
1611 
1612   PetscFunctionBegin;
1613   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1614   /* check if matrix local sizes are compatible */
1615   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1616 
1617   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1618   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1619 
1620   /* create struct Mat_PtAPMPI and attach it to C later */
1621   ierr = PetscNew(&ptap);CHKERRQ(ierr);
1622 
1623   /* get A_loc by taking all local rows of A */
1624   ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
1625 
1626   ptap->A_loc = A_loc;
1627   a_loc       = (Mat_SeqAIJ*)(A_loc)->data;
1628   ai          = a_loc->i;
1629   aj          = a_loc->j;
1630 
1631   /* determine symbolic Co=(p->B)^T*A - send to others */
1632   /*----------------------------------------------------*/
1633   ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
1634   pdt  = (Mat_SeqAIJ*)PDt->data;
1635   pdti = pdt->i; pdtj = pdt->j;
1636 
1637   ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
1638   pot  = (Mat_SeqAIJ*)POt->data;
1639   poti = pot->i; potj = pot->j;
1640 
1641   /* then, compute symbolic Co = (p->B)^T*A */
1642   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1643                          >= (num of nonzero rows of C_seq) - pn */
1644   ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
1645   coi[0] = 0;
1646 
1647   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1648   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],ai[am]));
1649   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1650   current_space = free_space;
1651 
1652   /* create and initialize a linked list */
1653   ierr = PetscTableCreate(aN,aN,&ta);CHKERRQ(ierr);
1654   MatRowMergeMax_SeqAIJ(a_loc,am,ta);
1655   ierr = PetscTableGetCount(ta,&Armax);CHKERRQ(ierr);
1656   ierr = PetscLLCondensedCreate_Scalable(Armax,&lnk);CHKERRQ(ierr);
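  /* the condensed linked list is sized by Armax, the number of distinct column indices recorded
     in the table for the local rows of A_loc, instead of by the global column count aN; this
     bounded workspace is what makes this variant scalable */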
1657 
1658   for (i=0; i<pon; i++) {
1659     pnz = poti[i+1] - poti[i];
1660     ptJ = potj + poti[i];
1661     for (j=0; j<pnz; j++) {
1662       row  = ptJ[j]; /* row of A_loc == col of Pot */
1663       anz  = ai[row+1] - ai[row];
1664       Jptr = aj + ai[row];
1665       /* add non-zero cols of AP into the sorted linked list lnk */
1666       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1667     }
1668     nnz = lnk[0];
1669 
1670     /* If free space is not available, double the total space in the list */
1671     if (current_space->local_remaining<nnz) {
1672       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1673       nspacedouble++;
1674     }
1675 
1676     /* Copy data into free space, and zero out dense rows */
1677     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1678 
1679     current_space->array           += nnz;
1680     current_space->local_used      += nnz;
1681     current_space->local_remaining -= nnz;
1682 
1683     coi[i+1] = coi[i] + nnz;
1684   }
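  /* coi[] now holds the row offsets of Co = (p->B)^T*A_loc; its column indices are compacted
     into coj below and then shipped to the processes that own the corresponding rows of C */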
1685 
1686   ierr = PetscMalloc1(coi[pon]+1,&coj);CHKERRQ(ierr);
1687   ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
1688   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); /* must destroy to get a new one for C */
1689 
1690   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1691   if (afill_tmp > afill) afill = afill_tmp;
1692 
1693   /* send j-array (coj) of Co to other processors */
1694   /*----------------------------------------------*/
1695   /* determine row ownership */
1696   ierr = PetscNew(&merge);CHKERRQ(ierr);
1697   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1698 
1699   merge->rowmap->n  = pn;
1700   merge->rowmap->bs = 1;
1701 
1702   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1703   owners = merge->rowmap->range;
1704 
1705   /* determine the number of messages to send, their lengths */
1706   ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
1707   ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);
1708 
1709   len_s        = merge->len_s;
1710   merge->nsend = 0;
1711 
1712   ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
1713   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1714 
1715   proc = 0;
1716   for (i=0; i<pon; i++) {
1717     while (prmap[i] >= owners[proc+1]) proc++;
1718     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1719     len_s[proc] += coi[i+1] - coi[i];
1720   }
1721 
1722   len          = 0; /* max length of buf_si[] */
1723   owners_co[0] = 0;
1724   for (proc=0; proc<size; proc++) {
1725     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1726     if (len_si[proc]) {
1727       merge->nsend++;
1728       len_si[proc] = 2*(len_si[proc] + 1);
1729       len         += len_si[proc];
1730     }
1731   }
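  /* each i-structure message has length len_si[proc] = 2*(nrows+1) integers: a row count,
     nrows local row indices, and nrows+1 i-structure offsets, matching the packing of buf_si
     below; len is the total size of the send buffer buf_s */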
1732 
1733   /* determine the number and length of messages to receive for coi and coj  */
1734   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1735   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1736 
1737   /* post the Irecv and Isend of coj */
1738   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1739   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1740   ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
1741   for (proc=0, k=0; proc<size; proc++) {
1742     if (!len_s[proc]) continue;
1743     i    = owners_co[proc];
1744     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1745     k++;
1746   }
1747 
1748   /* receives and sends of coj are complete */
1749   ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
1750   for (i=0; i<merge->nrecv; i++) {
1751     PetscMPIInt icompleted;
1752     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1753   }
1754   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1755   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1756 
1757   /* add received column indices into table to update Armax */
1758   for (k=0; k<merge->nrecv; k++) {/* k-th received message */
1759     Jptr = buf_rj[k];
1760     for (j=0; j<merge->len_r[k]; j++) {
1761       ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
1762     }
1763   }
1764   ierr = PetscTableGetCount(ta,&Armax);CHKERRQ(ierr);
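  /* Armax now also accounts for column indices received from other processes, so the linked
     list recreated below can hold any merged row of C */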
1765 
1766   /* send and recv coi */
1767   /*-------------------*/
1768   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1769   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1770   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
1771   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1772   for (proc=0,k=0; proc<size; proc++) {
1773     if (!len_s[proc]) continue;
1774     /* form outgoing message for i-structure:
1775          buf_si[0]:                 nrows to be sent
1776                [1:nrows]:           row index (global)
1777                [nrows+1:2*nrows+1]: i-structure index
1778     */
1779     /*-------------------------------------------*/
1780     nrows       = len_si[proc]/2 - 1;
1781     buf_si_i    = buf_si + nrows+1;
1782     buf_si[0]   = nrows;
1783     buf_si_i[0] = 0;
1784     nrows       = 0;
1785     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1786       nzi               = coi[i+1] - coi[i];
1787       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
1788       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1789       nrows++;
1790     }
1791     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1792     k++;
1793     buf_si += len_si[proc];
1794   }
1795   i = merge->nrecv;
1796   while (i--) {
1797     PetscMPIInt icompleted;
1798     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1799   }
1800   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1801   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1802   ierr = PetscFree(len_si);CHKERRQ(ierr);
1803   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1804   ierr = PetscFree(swaits);CHKERRQ(ierr);
1805   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1806   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1807 
1808   /* compute the local portion of C (mpi mat) */
1809   /*------------------------------------------*/
1810   /* allocate bi array and free space for accumulating nonzero column info */
1811   ierr  = PetscMalloc1(pn+1,&bi);CHKERRQ(ierr);
1812   bi[0] = 0;
1813 
1814   /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1815   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pdti[pn],PetscIntSumTruncate(poti[pon],ai[am])));
1816   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1817   current_space = free_space;
1818 
1819   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1820   for (k=0; k<merge->nrecv; k++) {
1821     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1822     nrows       = *buf_ri_k[k];
1823     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
1824     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1825   }
1826 
1827   ierr = PetscLLCondensedCreate_Scalable(Armax,&lnk);CHKERRQ(ierr);
1828   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1829   rmax = 0;
1830   for (i=0; i<pn; i++) {
1831     /* add pdt[i,:]*AP into lnk */
1832     pnz = pdti[i+1] - pdti[i];
1833     ptJ = pdtj + pdti[i];
1834     for (j=0; j<pnz; j++) {
1835       row  = ptJ[j];  /* row of AP == col of Pt */
1836       anz  = ai[row+1] - ai[row];
1837       Jptr = aj + ai[row];
1838       /* add non-zero cols of AP into the sorted linked list lnk */
1839       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1840     }
1841 
1842     /* add received col data into lnk */
1843     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1844       if (i == *nextrow[k]) { /* i-th row */
1845         nzi  = *(nextci[k]+1) - *nextci[k];
1846         Jptr = buf_rj[k] + *nextci[k];
1847         ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
1848         nextrow[k]++; nextci[k]++;
1849       }
1850     }
1851     nnz = lnk[0];
1852 
1853     /* if free space is not available, make more free space */
1854     if (current_space->local_remaining<nnz) {
1855       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1856       nspacedouble++;
1857     }
1858     /* copy data into free space, then initialize lnk */
1859     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1860     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1861 
1862     current_space->array           += nnz;
1863     current_space->local_used      += nnz;
1864     current_space->local_remaining -= nnz;
1865 
1866     bi[i+1] = bi[i] + nnz;
1867     if (nnz > rmax) rmax = nnz;
1868   }
1869   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1870 
1871   ierr      = PetscMalloc1(bi[pn]+1,&bj);CHKERRQ(ierr);
1872   ierr      = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1873   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1874   if (afill_tmp > afill) afill = afill_tmp;
1875   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
1876   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
1877 
1878   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1879   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1880 
1881   /* create symbolic parallel matrix Cmpi - why can it not be assembled in the Numeric part? */
1882   /*----------------------------------------------------------------------------------*/
1883   ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);
1884 
1885   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1886   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1887   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
1888   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1889   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1890   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1891   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1892   for (i=0; i<pn; i++) {
1893     row  = i + rstart;
1894     nnz  = bi[i+1] - bi[i];
1895     Jptr = bj + bi[i];
1896     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1897   }
1898   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1899   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1900   ierr = PetscFree(vals);CHKERRQ(ierr);
1901 
1902   merge->bi        = bi;
1903   merge->bj        = bj;
1904   merge->coi       = coi;
1905   merge->coj       = coj;
1906   merge->buf_ri    = buf_ri;
1907   merge->buf_rj    = buf_rj;
1908   merge->owners_co = owners_co;
1909 
1910   /* attach the supporting struct to Cmpi for reuse */
1911   c = (Mat_MPIAIJ*)Cmpi->data;
1912 
1913   c->ptap     = ptap;
1914   ptap->api   = NULL;
1915   ptap->apj   = NULL;
1916   ptap->merge = merge;
1917   ptap->apa   = NULL;
1918   ptap->destroy   = Cmpi->ops->destroy;
1919   ptap->duplicate = Cmpi->ops->duplicate;
1920 
1921   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
1922   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1923   Cmpi->ops->duplicate               = MatDuplicate_MPIAIJ_MatPtAP;
1924 
1925   *C = Cmpi;
1926 #if defined(PETSC_USE_INFO)
1927   if (bi[pn] != 0) {
1928     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
1929     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
1930   } else {
1931     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1932   }
1933 #endif
1934   PetscFunctionReturn(0);
1935 }
1936