
/*
  Defines matrix-matrix product routines for pairs of MPIAIJ matrices
          C = A * B
*/
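/*
   Typical call sequence (a minimal usage sketch, not part of this file; it assumes
   A and B are assembled MPIAIJ matrices with compatible layouts):

     Mat C;
     ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);   symbolic + numeric product
     ... change numerical values of A and/or B, keeping their nonzero patterns ...
     ierr = MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);     numeric phase only, reusing the symbolic data
     ierr = MatDestroy(&C);CHKERRQ(ierr);
*/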
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/utils/freespace.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <../src/mat/impls/dense/mpi/mpidense.h>
#include <petsc/private/vecimpl.h>

#undef __FUNCT__
#define __FUNCT__ "MatMatMult_MPIAIJ_MPIAIJ"
PETSC_INTERN PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C)
{
  PetscErrorCode ierr;
  const char     *algTypes[2] = {"scalable","nonscalable"};
  PetscInt       alg=1; /* set default algorithm */
  MPI_Comm       comm;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
    if (A->cmap->rstart != B->rmap->rstart || A->cmap->rend != B->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D,%D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,B->rmap->rstart,B->rmap->rend);

    ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
    ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,2,algTypes[1],&alg,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsEnd();CHKERRQ(ierr);
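    /* e.g. run with -matmatmult_via scalable to select the memory-scalable algorithm below */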
    ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
    switch (alg) {
    case 1:
      ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A,B,fill,C);CHKERRQ(ierr);
      break;
    default:
      ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);CHKERRQ(ierr);
      break;
    }
    ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJ_MatMatMult"
PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI    *ptap = a->ptap;

  PetscFunctionBegin;
  ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
  ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);
  ierr = PetscFree(ptap->api);CHKERRQ(ierr);
  ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
  ierr = PetscFree(ptap->apa);CHKERRQ(ierr);
  ierr = ptap->destroy(A);CHKERRQ(ierr);
  ierr = PetscFree(ptap);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDuplicate_MPIAIJ_MatMatMult"
PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI    *ptap = a->ptap;

  PetscFunctionBegin;
  ierr = (*ptap->duplicate)(A,op,M);CHKERRQ(ierr);

  (*M)->ops->destroy   = ptap->destroy;   /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
  (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable"
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a  =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ     *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
  PetscScalar    *cda=cd->a,*coa=co->a;
  Mat_SeqAIJ     *p_loc,*p_oth;
  PetscScalar    *apa,*ca;
  PetscInt       cm   =C->rmap->n;
  Mat_PtAPMPI    *ptap=c->ptap;
  PetscInt       *api,*apj,*apJ,i,k;
  PetscInt       cstart=C->cmap->rstart;
  PetscInt       cdnz,conz,k0,k1;
  MPI_Comm       comm;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  /* update numerical values of P_oth and P_loc */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
  /*----------------------------------------------------------*/
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = NULL;
  if (size > 1) {
    p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  }

  /* get apa for storing dense row A[i,:]*P */
  apa = ptap->apa;

  api = ptap->api;
  apj = ptap->apj;
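  /* Each row AP[i,:] is scattered into C in three pieces: leading off-diagonal
     columns (< cstart), the diagonal block, then trailing off-diagonal columns.
     apJ lists the columns of AP[i,:] in ascending order, so one pass suffices. */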
  for (i=0; i<cm; i++) {
    /* compute apa = A[i,:]*P */
    AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);

    /* set values in C */
    apJ  = apj + api[i];
    cdnz = cd->i[i+1] - cd->i[i];
    conz = co->i[i+1] - co->i[i];

    /* 1st off-diagonal part of C */
    ca = coa + co->i[i];
    k  = 0;
    for (k0=0; k0<conz; k0++) {
      if (apJ[k] >= cstart) break;
      ca[k0]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }

    /* diagonal part of C */
    ca = cda + cd->i[i];
    for (k1=0; k1<cdnz; k1++) {
      ca[k1]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }

    /* 2nd off-diagonal part of C */
    ca = coa + co->i[i];
    for (; k0<conz; k0++) {
      ca[k0]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable"
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode     ierr;
  MPI_Comm           comm;
  PetscMPIInt        size;
  Mat                Cmpi;
  Mat_PtAPMPI        *ptap;
  PetscFreeSpaceList free_space=NULL,current_space=NULL;
  Mat_MPIAIJ         *a        =(Mat_MPIAIJ*)A->data,*c;
  Mat_SeqAIJ         *ad       =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
  PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
  PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
  PetscInt           *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
  PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n,Crmax;
  PetscBT            lnkbt;
  PetscScalar        *apa;
  PetscReal          afill;
  PetscTable         ta;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j;
  if (size > 1) {
    p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
    pi_oth = p_oth->i; pj_oth = p_oth->j;
  } else {
    p_oth = NULL;
    pi_oth = NULL; pj_oth = NULL;
  }

  /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
  /*-------------------------------------------------------------------*/
  ierr      = PetscMalloc1(am+2,&api);CHKERRQ(ierr);
  ptap->api = api;
  api[0]    = 0;

  /* create and initialize a linked list */
  Crmax = 6*(p_loc->rmax + (PetscInt)(1.e-2*pN));
  if (Crmax > pN) Crmax = pN;
  ierr = PetscTableCreate(Crmax,pN,&ta);CHKERRQ(ierr);
  MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
  MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
  ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
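  /* Crmax now counts the distinct columns that appear in P_loc and P_oth, which
     bounds the number of nonzeros in any row of AP */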
  ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
  ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    /* diagonal portion of A */
    nzi = adi[i+1] - adi[i];
    for (j=0; j<nzi; j++) {
      row  = *adj++;
      pnz  = pi_loc[row+1] - pi_loc[row];
      Jptr = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    for (j=0; j<nzi; j++) {
      row  = *aoj++;
      pnz  = pi_oth[row+1] - pi_oth[row];
      Jptr = pj_oth + pi_oth[row];
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }

    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }

  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc1(api[am]+1,&ptap->apj);CHKERRQ(ierr);
  apj  = ptap->apj;
  ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* malloc apa to store dense row A[i,:]*P */
  ierr = PetscCalloc1(pN,&apa);CHKERRQ(ierr);

  ptap->apa = apa;

  /* create and assemble symbolic parallel matrix Cmpi */
  /*----------------------------------------------------*/
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);

  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    row  = i + rstart;
    apnz = api[i+1] - api[i];
    ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
    apj += apnz;
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->destroy        = Cmpi->ops->destroy;
  ptap->duplicate      = Cmpi->ops->duplicate;
  Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
  Cmpi->ops->destroy   = MatDestroy_MPIAIJ_MatMatMult;
  Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;

  /* attach the supporting struct to Cmpi for reuse */
  c       = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap = ptap;

  *C = Cmpi;

  /* set MatInfo */
  afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
  if (afill < 1.0) afill = 1.0;
  Cmpi->info.mallocs           = nspacedouble;
  Cmpi->info.fill_ratio_given  = fill;
  Cmpi->info.fill_ratio_needed = afill;

#if defined(PETSC_USE_INFO)
  if (api[am]) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMult_MPIAIJ_MPIDense"
PETSC_INTERN PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
    ierr = MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  ierr = MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

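/* The MPIAIJ_MPIDense context below is stashed (in a PetscContainer named "workB")
   on the product matrix C: workB holds the needed off-process rows of B, while
   rvalues/svalues and rwaits/swaits are the receive/send buffers and MPI requests
   reused by every numeric product. */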
typedef struct {
  Mat         workB;
  PetscScalar *rvalues,*svalues;
  MPI_Request *rwaits,*swaits;
} MPIAIJ_MPIDense;

#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJ_MPIDenseDestroy"
PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
{
  MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatDestroy(&contents->workB);CHKERRQ(ierr);
  ierr = PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);CHKERRQ(ierr);
  ierr = PetscFree(contents);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMultNumeric_MPIDense"
/*
    This is a "dummy" function that handles the case where matrix C was created as a dense matrix
  directly by the user and passed to MatMatMult() with the MAT_REUSE_MATRIX option.

  It is the same as MatMatMultSymbolic_MPIAIJ_MPIDense() except that it does not create C.
*/
PetscErrorCode MatMatMultNumeric_MPIDense(Mat A,Mat B,Mat C)
{
  PetscErrorCode         ierr;
  PetscBool              flg;
  Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
  PetscInt               nz   = aij->B->cmap->n;
  PetscContainer         container;
  MPIAIJ_MPIDense        *contents;
  VecScatter             ctx   = aij->Mvctx;
  VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
  VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)B,MATMPIDENSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Second matrix must be mpidense");

  /* Handle the case where the user provided the final C matrix rather than calling MatMatMult() with MAT_INITIAL_MATRIX */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATMPIAIJ,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"First matrix must be MPIAIJ");

  C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;

  ierr = PetscNew(&contents);CHKERRQ(ierr);
  /* Create work matrix used to store off processor rows of B needed for local product */
  ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
  /* Create work arrays needed */
  ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
                      B->cmap->N*to->starts[to->n],&contents->svalues,
                      from->n,&contents->rwaits,
                      to->n,&contents->swaits);CHKERRQ(ierr);

  ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr);
  ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
  ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
  ierr = PetscObjectCompose((PetscObject)C,"workB",(PetscObject)container);CHKERRQ(ierr);
  ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);

  ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIDense"
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
{
  PetscErrorCode         ierr;
  Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
  PetscInt               nz   = aij->B->cmap->n;
  PetscContainer         container;
  MPIAIJ_MPIDense        *contents;
  VecScatter             ctx   = aij->Mvctx;
  VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
  VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
  PetscInt               m     = A->rmap->n,n=B->cmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)B),C);CHKERRQ(ierr);
  ierr = MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);CHKERRQ(ierr);
  ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
  ierr = MatSetType(*C,MATMPIDENSE);CHKERRQ(ierr);
  ierr = MatMPIDenseSetPreallocation(*C,NULL);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;

  ierr = PetscNew(&contents);CHKERRQ(ierr);
  /* Create work matrix used to store off processor rows of B needed for local product */
  ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
  /* Create work arrays needed */
  ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
                      B->cmap->N*to->starts[to->n],&contents->svalues,
                      from->n,&contents->rwaits,
                      to->n,&contents->swaits);CHKERRQ(ierr);

  ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr);
  ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
  ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
  ierr = PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);CHKERRQ(ierr);
  ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMPIDenseScatter"
/*
    Performs an efficient scatter on the rows of B needed by this process; this is
    a modification of the VecScatterBegin_() routines.
*/
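/* Note: dense matrices here store their local arrays in column-major order, so
   entry (row,col) of B lives at b[row + nrowsB*col]; this explains the strides
   in the pack and unpack loops below */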
PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
{
  Mat_MPIAIJ             *aij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode         ierr;
  PetscScalar            *b,*w,*svalues,*rvalues;
  VecScatter             ctx   = aij->Mvctx;
  VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
  VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
  PetscInt               i,j,k;
  PetscInt               *sindices,*sstarts,*rindices,*rstarts;
  PetscMPIInt            *sprocs,*rprocs,nrecvs;
  MPI_Request            *swaits,*rwaits;
  MPI_Comm               comm;
  PetscMPIInt            tag  = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
  MPI_Status             status;
  MPIAIJ_MPIDense        *contents;
  PetscContainer         container;
  Mat                    workB;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);CHKERRQ(ierr);
  if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
  ierr = PetscContainerGetPointer(container,(void**)&contents);CHKERRQ(ierr);

  workB = *outworkB = contents->workB;
  if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",workB->rmap->n,nrows);
  sindices = to->indices;
  sstarts  = to->starts;
  sprocs   = to->procs;
  swaits   = contents->swaits;
  svalues  = contents->svalues;

  rindices = from->indices;
  rstarts  = from->starts;
  rprocs   = from->procs;
  rwaits   = contents->rwaits;
  rvalues  = contents->rvalues;

  ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr);
  ierr = MatDenseGetArray(workB,&w);CHKERRQ(ierr);

  for (i=0; i<from->n; i++) {
    ierr = MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);CHKERRQ(ierr);
  }

  for (i=0; i<to->n; i++) {
    /* pack a message at a time */
    for (j=0; j<sstarts[i+1]-sstarts[i]; j++) {
      for (k=0; k<ncols; k++) {
        svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
      }
    }
    ierr = MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);CHKERRQ(ierr);
  }

  nrecvs = from->n;
  while (nrecvs) {
    ierr = MPI_Waitany(from->n,rwaits,&imdex,&status);CHKERRQ(ierr);
    nrecvs--;
    /* unpack a message at a time */
    for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++) {
      for (k=0; k<ncols; k++) {
        w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
      }
    }
  }
  if (to->n) {ierr = MPI_Waitall(to->n,swaits,to->sstatus);CHKERRQ(ierr);}

  ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr);
  ierr = MatDenseRestoreArray(workB,&w);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);

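/* C = A*B with B dense is formed in two pieces: the diagonal block of A times the
   locally owned rows of B, plus the off-diagonal block of A times the off-process
   rows of B gathered into workB by MatMPIDenseScatter() */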
#undef __FUNCT__
#define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIDense"
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij    = (Mat_MPIAIJ*)A->data;
  Mat_MPIDense   *bdense = (Mat_MPIDense*)B->data;
  Mat_MPIDense   *cdense = (Mat_MPIDense*)C->data;
  Mat            workB;

  PetscFunctionBegin;
  /* diagonal block of A times all local rows of B */
  ierr = MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);CHKERRQ(ierr);

  /* get off processor parts of B needed to complete the product */
  ierr = MatMPIDenseScatter(A,B,C,&workB);CHKERRQ(ierr);

  /* off-diagonal block of A times nonlocal rows of B */
  ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ"
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a   = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *ad  = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ     *cd  = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
  PetscInt       *adi = ad->i,*adj,*aoi=ao->i,*aoj;
  PetscScalar    *ada,*aoa,*cda=cd->a,*coa=co->a;
  Mat_SeqAIJ     *p_loc,*p_oth;
  PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
  PetscScalar    *pa_loc,*pa_oth,*pa,valtmp,*ca;
  PetscInt       cm          = C->rmap->n,anz,pnz;
  Mat_PtAPMPI    *ptap       = c->ptap;
  PetscScalar    *apa_sparse = ptap->apa;
  PetscInt       *api,*apj,*apJ,i,j,k,row;
  PetscInt       cstart = C->cmap->rstart;
  PetscInt       cdnz,conz,k0,k1,nextp;
  MPI_Comm       comm;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  /* update numerical values of P_oth and P_loc */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
  /*----------------------------------------------------------*/
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j; pa_loc = p_loc->a;
  if (size > 1) {
    p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
    pi_oth = p_oth->i; pj_oth = p_oth->j; pa_oth = p_oth->a;
  } else {
    p_oth = NULL; pi_oth = NULL; pj_oth = NULL; pa_oth = NULL;
  }

  api = ptap->api;
  apj = ptap->apj;
  for (i=0; i<cm; i++) {
    apJ = apj + api[i];

    /* diagonal portion of A */
    anz = adi[i+1] - adi[i];
    adj = ad->j + adi[i];
    ada = ad->a + adi[i];
    for (j=0; j<anz; j++) {
      row = adj[j];
      pnz = pi_loc[row+1] - pi_loc[row];
      pj  = pj_loc + pi_loc[row];
      pa  = pa_loc + pi_loc[row];
      /* perform sparse axpy */
      valtmp = ada[j];
      nextp  = 0;
      for (k=0; nextp<pnz; k++) {
        if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
          apa_sparse[k] += valtmp*pa[nextp++];
        }
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* off-diagonal portion of A */
    anz = aoi[i+1] - aoi[i];
    aoj = ao->j + aoi[i];
    aoa = ao->a + aoi[i];
    for (j=0; j<anz; j++) {
      row = aoj[j];
      pnz = pi_oth[row+1] - pi_oth[row];
      pj  = pj_oth + pi_oth[row];
      pa  = pa_oth + pi_oth[row];
      /* perform sparse axpy */
      valtmp = aoa[j];
      nextp  = 0;
      for (k=0; nextp<pnz; k++) {
        if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
          apa_sparse[k] += valtmp*pa[nextp++];
        }
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* set values in C */
    cdnz = cd->i[i+1] - cd->i[i];
    conz = co->i[i+1] - co->i[i];

    /* 1st off-diagonal part of C */
    ca = coa + co->i[i];
    k  = 0;
    for (k0=0; k0<conz; k0++) {
      if (apJ[k] >= cstart) break;
      ca[k0]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }

    /* diagonal part of C */
    ca = cda + cd->i[i];
    for (k1=0; k1<cdnz; k1++) {
      ca[k1]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }

    /* 2nd off-diagonal part of C */
    ca = coa + co->i[i];
    for (; k0<conz; k0++) {
      ca[k0]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(), except using LLCondensed to avoid O(BN) memory requirement */
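/* (the nonscalable routine allocates a dense work row apa of length pN = P->cmap->N,
   the global number of columns of P; the routine below only allocates apnz_max
   entries, the largest number of nonzeros in any row of AP) */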
#undef __FUNCT__
#define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ"
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode     ierr;
  MPI_Comm           comm;
  PetscMPIInt        size;
  Mat                Cmpi;
  Mat_PtAPMPI        *ptap;
  PetscFreeSpaceList free_space = NULL,current_space=NULL;
  Mat_MPIAIJ         *a         = (Mat_MPIAIJ*)A->data,*c;
  Mat_SeqAIJ         *ad        = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
  PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
  PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
  PetscInt           i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max;
  PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
  PetscReal          afill;
  PetscScalar        *apa;
  PetscTable         ta;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j;
  if (size > 1) {
    p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
    pi_oth = p_oth->i; pj_oth = p_oth->j;
  } else {
    p_oth  = NULL;
    pi_oth = NULL; pj_oth = NULL;
  }

  /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
  /*-------------------------------------------------------------------*/
  ierr      = PetscMalloc1(am+2,&api);CHKERRQ(ierr);
  ptap->api = api;
  api[0]    = 0;

  /* create and initialize a linked list */
  apnz_max = 6*(p_loc->rmax + (PetscInt)(1.e-2*pN)); /* expected apnz_max */
  if (apnz_max > pN) apnz_max = pN;
  ierr = PetscTableCreate(apnz_max,pN,&ta);CHKERRQ(ierr);

  /* Calculate apnz_max */
  apnz_max = 0;
  for (i=0; i<am; i++) {
    ierr = PetscTableRemoveAll(ta);CHKERRQ(ierr);
    /* diagonal portion of A */
    nzi  = adi[i+1] - adi[i];
    Jptr = adj+adi[i];  /* cols of A_diag */
    MatMergeRows_SeqAIJ(p_loc,nzi,Jptr,ta);
    ierr = PetscTableGetCount(ta,&apnz);CHKERRQ(ierr);
    if (apnz_max < apnz) apnz_max = apnz;

    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    Jptr = aoj+aoi[i];  /* cols of A_off */
    MatMergeRows_SeqAIJ(p_oth,nzi,Jptr,ta);
    ierr = PetscTableGetCount(ta,&apnz);CHKERRQ(ierr);
    if (apnz_max < apnz) apnz_max = apnz;
  }
  ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);

  ierr = PetscLLCondensedCreate_Scalable(apnz_max,&lnk);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
  ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);CHKERRQ(ierr);
  current_space = free_space;
  ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    /* diagonal portion of A */
    nzi = adi[i+1] - adi[i];
    for (j=0; j<nzi; j++) {
      row  = *adj++;
      pnz  = pi_loc[row+1] - pi_loc[row];
      Jptr = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }
    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    for (j=0; j<nzi; j++) {
      row  = *aoj++;
      pnz  = pi_oth[row+1] - pi_oth[row];
      Jptr = pj_oth + pi_oth[row];
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }

    apnz     = *lnk;
    api[i+1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }

  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc1(api[am]+1,&ptap->apj);CHKERRQ(ierr);
  apj  = ptap->apj;
  ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);

  /* create and assemble symbolic parallel matrix Cmpi */
  /*----------------------------------------------------*/
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* malloc apa for assembling Cmpi */
  ierr = PetscCalloc1(apnz_max,&apa);CHKERRQ(ierr);

  ptap->apa = apa;
  for (i=0; i<am; i++) {
    row  = i + rstart;
    apnz = api[i+1] - api[i];
    ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
    apj += apnz;
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->destroy             = Cmpi->ops->destroy;
  ptap->duplicate           = Cmpi->ops->duplicate;
  Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ;
  Cmpi->ops->destroy        = MatDestroy_MPIAIJ_MatMatMult;
  Cmpi->ops->duplicate      = MatDuplicate_MPIAIJ_MatMatMult;

  /* attach the supporting struct to Cmpi for reuse */
  c       = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap = ptap;

  *C = Cmpi;

  /* set MatInfo */
  afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
  if (afill < 1.0) afill = 1.0;
  Cmpi->info.mallocs           = nspacedouble;
  Cmpi->info.fill_ratio_given  = fill;
  Cmpi->info.fill_ratio_needed = afill;

#if defined(PETSC_USE_INFO)
  if (api[am]) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}

/*-------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMult_MPIAIJ_MPIAIJ"
PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  const char     *algTypes[3] = {"scalable","nonscalable","matmatmult"};
  PetscInt       alg=0; /* set default algorithm */

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
    ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,3,algTypes[0],&alg,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsEnd();CHKERRQ(ierr);
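    /* e.g. run with -mattransposematmult_via matmatmult to compute C = P^T*A by
       explicitly transposing P and calling MatMatMult() */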
    ierr = PetscLogEventBegin(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
    switch (alg) {
    case 1:
      ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(P,A,fill,C);CHKERRQ(ierr);
      break;
    case 2:
    {
      Mat         Pt;
      Mat_PtAPMPI *ptap;
      Mat_MPIAIJ  *c;
      ierr = MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);CHKERRQ(ierr);
      ierr = MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr);
      c        = (Mat_MPIAIJ*)(*C)->data;
      ptap     = c->ptap;
      ptap->Pt = Pt;
      (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult;
      PetscFunctionReturn(0);
    }
      break;
    default:
      ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);CHKERRQ(ierr);
      break;
    }
    ierr = PetscLogEventEnd(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
  ierr = (*(*C)->ops->mattransposemultnumeric)(P,A,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* This routine only works when scall=MAT_REUSE_MATRIX! */
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult"
PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult(Mat P,Mat A,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *c=(Mat_MPIAIJ*)C->data;
  Mat_PtAPMPI    *ptap= c->ptap;
  Mat            Pt=ptap->Pt;

  PetscFunctionBegin;
  ierr = MatTranspose(P,MAT_REUSE_MATRIX,&Pt);CHKERRQ(ierr);
  ierr = MatMatMultNumeric(Pt,A,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Non-scalable version, use dense axpy */
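/* (non-scalable because each row of A is expanded into a dense work array aval of
   length A->cmap->N, the global number of columns of A) */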
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable"
PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,Mat C)
{
  PetscErrorCode      ierr;
  Mat_Merge_SeqsToMPI *merge;
  Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
  Mat_PtAPMPI         *ptap;
  PetscInt            *adj,*aJ;
  PetscInt            i,j,k,anz,pnz,row,*cj;
  MatScalar           *ada,*aval,*ca,valtmp;
  PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
  MPI_Comm            comm;
  PetscMPIInt         size,rank,taga,*len_s;
  PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
  PetscInt            **buf_ri,**buf_rj;
  PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
  MPI_Request         *s_waits,*r_waits;
  MPI_Status          *status;
  MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
  PetscInt            *ai,*aj,*coi,*coj;
  PetscInt            *poJ,*pdJ;
  Mat                 A_loc;
  Mat_SeqAIJ          *a_loc;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ptap  = c->ptap;
  merge = ptap->merge;

  /* 2) compute numeric C_seq = P_loc^T*A_loc - dominating part */
  /*------------------------------------------------------------*/
  /* get data from symbolic products */
  coi  = merge->coi; coj = merge->coj;
  ierr = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);

  bi     = merge->bi; bj = merge->bj;
  owners = merge->rowmap->range;
  ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  A_loc = ptap->A_loc;
  ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
  a_loc = (Mat_SeqAIJ*)(A_loc)->data;
  ai    = a_loc->i;
  aj    = a_loc->j;

  ierr = PetscCalloc1(A->cmap->N,&aval);CHKERRQ(ierr); /* non-scalable!!! */

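  /* C = P^T*A is accumulated one outer product at a time: for each local row i,
     A[i,:] is scattered into the dense array aval, then scaled copies of it are
     added into the rows of Co = (p->B)^T*A (to be sent away) and of Cd (kept
     locally) that are selected by the nonzeros of P[i,:] */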
  for (i=0; i<am; i++) {
    /* 2-a) put A[i,:] to dense array aval */
    anz = ai[i+1] - ai[i];
    adj = aj + ai[i];
    ada = a_loc->a + ai[i];
    for (j=0; j<anz; j++) {
      aval[adj[j]] = ada[j];
    }

    /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
    /*--------------------------------------------------------------*/
    /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
    pnz = po->i[i+1] - po->i[i];
    poJ = po->j + po->i[i];
    pA  = po->a + po->i[i];
    for (j=0; j<pnz; j++) {
      row = poJ[j];
      cnz = coi[row+1] - coi[row];
      cj  = coj + coi[row];
      ca  = coa + coi[row];
      /* perform dense axpy */
      valtmp = pA[j];
      for (k=0; k<cnz; k++) {
        ca[k] += valtmp*aval[cj[k]];
      }
      ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
    }

    /* put the value into Cd (diagonal part) */
    pnz = pd->i[i+1] - pd->i[i];
    pdJ = pd->j + pd->i[i];
    pA  = pd->a + pd->i[i];
    for (j=0; j<pnz; j++) {
      row = pdJ[j];
      cnz = bi[row+1] - bi[row];
      cj  = bj + bi[row];
      ca  = ba + bi[row];
      /* perform dense axpy */
      valtmp = pA[j];
      for (k=0; k<cnz; k++) {
        ca[k] += valtmp*aval[cj[k]];
      }
      ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
    }

    /* zero out the current row of A in aval */
    aJ = aj + ai[i];
    for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
  }

  /* 3) send and recv matrix values coa */
  /*------------------------------------*/
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;
  len_s  = merge->len_s;
  ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
  ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = merge->owners_co[proc];
    ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }
  if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}

  ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);
  ierr = PetscFree(coa);CHKERRQ(ierr);

  /* 4) insert local Cseq and received values into Cmpi */
  /*----------------------------------------------------*/
  ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
    nrows       = *(buf_ri_k[k]);
    nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
  }

  for (i=0; i<cm; i++) {
    row  = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj + bi[i];  /* col indices of the i-th row of C */
    ba_i = ba + bi[i];
    bnz  = bi[i+1] - bi[i];
    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++) { /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cnz    = *(nextci[k]+1) - *nextci[k];
        cj     = buf_rj[k] + *(nextci[k]);
        ca     = abuf_r[k] + *(nextci[k]);
        nextcj = 0;
        for (j=0; nextcj<cnz; j++) {
          if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
            ba_i[j] += ca[nextcj++];
          }
        }
        nextrow[k]++; nextci[k]++;
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }
    }
    ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ierr = PetscFree(ba);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscFree(aval);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
/* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
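/* Outline: the rows of C = P^T*A owned by other processes are computed locally as
   Co = (p->B)^T*A and sent to their owners, while the locally owned rows come from
   Cd = (p->A)^T*A merged with the contributions received from other processes. */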
1104 #undef __FUNCT__
1105 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable"
1106 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,PetscReal fill,Mat *C)
1107 {
1108   PetscErrorCode      ierr;
1109   Mat                 Cmpi,A_loc,POt,PDt;
1110   Mat_PtAPMPI         *ptap;
1111   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1112   Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
1113   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1114   PetscInt            nnz;
1115   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1116   PetscInt            am=A->rmap->n,pn=P->cmap->n;
1117   PetscBT             lnkbt;
1118   MPI_Comm            comm;
1119   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1120   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1121   PetscInt            len,proc,*dnz,*onz,*owners;
1122   PetscInt            nzi,*bi,*bj;
1123   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1124   MPI_Request         *swaits,*rwaits;
1125   MPI_Status          *sstatus,rstatus;
1126   Mat_Merge_SeqsToMPI *merge;
1127   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1128   PetscReal           afill  =1.0,afill_tmp;
1129   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N;
1130   PetscScalar         *vals;
1131   Mat_SeqAIJ          *a_loc, *pdt,*pot;
1132 
1133   PetscFunctionBegin;
1134   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1135   /* check if matrix local sizes are compatible */
1136   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1137 
1138   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1139   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1140 
1141   /* create struct Mat_PtAPMPI and attached it to C later */
1142   ierr = PetscNew(&ptap);CHKERRQ(ierr);
1143 
1144   /* get A_loc by taking all local rows of A */
1145   ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
1146 
1147   ptap->A_loc = A_loc;
1148 
1149   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1150   ai    = a_loc->i;
1151   aj    = a_loc->j;
1152 
1153   /* determine symbolic Co=(p->B)^T*A - send to others */
1154   /*----------------------------------------------------*/
1155   ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
1156   pdt  = (Mat_SeqAIJ*)PDt->data;
1157   pdti = pdt->i; pdtj = pdt->j;
1158 
1159   ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
1160   pot  = (Mat_SeqAIJ*)POt->data;
1161   poti = pot->i; potj = pot->j;
1162 
1163   /* then, compute symbolic Co = (p->B)^T*A */
1164   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */
1165   ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
1166   coi[0] = 0;
1167 
1168   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1169   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],ai[am]));
1170   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1171   current_space = free_space;
1172 
1173   /* create and initialize a linked list */
1174   ierr = PetscLLCondensedCreate(aN,aN,&lnk,&lnkbt);CHKERRQ(ierr);
1175 
1176   for (i=0; i<pon; i++) {
1177     pnz = poti[i+1] - poti[i];
1178     ptJ = potj + poti[i];
1179     for (j=0; j<pnz; j++) {
1180       row  = ptJ[j]; /* row of A_loc == col of Pot */
1181       anz  = ai[row+1] - ai[row];
1182       Jptr = aj + ai[row];
1183       /* add non-zero cols of AP into the sorted linked list lnk */
1184       ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1185     }
1186     nnz = lnk[0];
1187 
1188     /* If free space is not available, double the total space in the list */
1189     if (current_space->local_remaining<nnz) {
1190       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1191       nspacedouble++;
1192     }
1193 
1194     /* Copy data into free space, and zero out denserows */
1195     ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1196 
1197     current_space->array           += nnz;
1198     current_space->local_used      += nnz;
1199     current_space->local_remaining -= nnz;
1200 
1201     coi[i+1] = coi[i] + nnz;
1202   }
1203 
1204   ierr = PetscMalloc1(coi[pon]+1,&coj);CHKERRQ(ierr);
1205   ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
1206 
1207   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1208   if (afill_tmp > afill) afill = afill_tmp;
1209 
1210   /* send j-array (coj) of Co to other processors */
1211   /*----------------------------------------------*/
1212   /* determine row ownership */
1213   ierr = PetscNew(&merge);CHKERRQ(ierr);
1214   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1215 
1216   merge->rowmap->n  = pn;
1217   merge->rowmap->bs = 1;
1218 
1219   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1220   owners = merge->rowmap->range;
1221 
1222   /* determine the number of messages to send, their lengths */
1223   ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
1224   ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);
1225 
1226   len_s        = merge->len_s;
1227   merge->nsend = 0;
1228 
1229   ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
1230   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1231 
1232   proc = 0;
1233   for (i=0; i<pon; i++) {
1234     while (prmap[i] >= owners[proc+1]) proc++;
1235     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1236     len_s[proc] += coi[i+1] - coi[i];
1237   }
1238 
1239   len          = 0; /* max length of buf_si[] */
1240   owners_co[0] = 0;
1241   for (proc=0; proc<size; proc++) {
1242     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1243     if (len_si[proc]) {
1244       merge->nsend++;
1245       len_si[proc] = 2*(len_si[proc] + 1);
1246       len         += len_si[proc];
1247     }
1248   }
1249 
1250   /* determine the number and length of messages to receive for coi and coj  */
1251   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1252   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1253 
1254   /* post the Irecv and Isend of coj */
1255   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1256   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1257   ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
1258   for (proc=0, k=0; proc<size; proc++) {
1259     if (!len_s[proc]) continue;
1260     i    = owners_co[proc];
1261     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1262     k++;
1263   }
1264 
1265   /* receives and sends of coj are complete */
1266   ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
1267   for (i=0; i<merge->nrecv; i++) {
1268     PetscMPIInt icompleted;
1269     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1270   }
1271   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1272   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1273 
1274   /* send and recv coi */
1275   /*-------------------*/
1276   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1277   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1278   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
1279   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1280   for (proc=0,k=0; proc<size; proc++) {
1281     if (!len_s[proc]) continue;
1282     /* form outgoing message for i-structure:
1283          buf_si[0]:                 nrows to be sent
1284                [1:nrows]:           row index (global)
1285                [nrows+1:2*nrows+1]: i-structure index
1286     */
1287     /*-------------------------------------------*/
1288     nrows       = len_si[proc]/2 - 1;
1289     buf_si_i    = buf_si + nrows+1;
1290     buf_si[0]   = nrows;
1291     buf_si_i[0] = 0;
1292     nrows       = 0;
1293     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1294       nzi               = coi[i+1] - coi[i];
1295       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1296       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1297       nrows++;
1298     }
1299     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1300     k++;
1301     buf_si += len_si[proc];
1302   }
1303   i = merge->nrecv;
1304   while (i--) {
1305     PetscMPIInt icompleted;
1306     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1307   }
1308   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1309   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1310   ierr = PetscFree(len_si);CHKERRQ(ierr);
1311   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1312   ierr = PetscFree(swaits);CHKERRQ(ierr);
1313   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1314   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1315 
1316   /* compute the local portion of C (mpi mat) */
1317   /*------------------------------------------*/
1318   /* allocate bi array and free space for accumulating nonzero column info */
1319   ierr  = PetscMalloc1(pn+1,&bi);CHKERRQ(ierr);
1320   bi[0] = 0;
1321 
1322   /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1323   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pdti[pn],PetscIntSumTruncate(poti[pon],ai[am])));
1324   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1325   current_space = free_space;
1326 
1327   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1328   for (k=0; k<merge->nrecv; k++) {
1329     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1330     nrows       = *buf_ri_k[k];
1331     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
1332     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure  */
1333   }
1334 
1335   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1336   rmax = 0;
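  /* For each local row i of C = Pt*A: merge into the sorted condensed linked list lnk the
     column indices of the A rows selected by Pd^T(i,:), plus the column indices received
     from other ranks for this row; lnk[0] then gives the row length, which is recorded in
     bi[] and counted into dnz/onz (per-row diagonal-block and off-diagonal-block nonzeros)
     for the MatMPIAIJSetPreallocation call below. */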
1337   for (i=0; i<pn; i++) {
1338     /* add pdt[i,:]*A into lnk */
1339     pnz = pdti[i+1] - pdti[i];
1340     ptJ = pdtj + pdti[i];
1341     for (j=0; j<pnz; j++) {
1342       row  = ptJ[j];  /* row of A == col of Pt */
1343       anz  = ai[row+1] - ai[row];
1344       Jptr = aj + ai[row];
1345       /* add non-zero cols of A into the sorted linked list lnk */
1346       ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1347     }
1348 
1349     /* add received col data into lnk */
1350     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1351       if (i == *nextrow[k]) { /* i-th row */
1352         nzi  = *(nextci[k]+1) - *nextci[k];
1353         Jptr = buf_rj[k] + *nextci[k];
1354         ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1355         nextrow[k]++; nextci[k]++;
1356       }
1357     }
1358     nnz = lnk[0];
1359 
1360     /* if free space is not available, make more free space */
1361     if (current_space->local_remaining<nnz) {
1362       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1363       nspacedouble++;
1364     }
1365     /* copy data into free space, then initialize lnk */
1366     ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1367     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1368 
1369     current_space->array           += nnz;
1370     current_space->local_used      += nnz;
1371     current_space->local_remaining -= nnz;
1372 
1373     bi[i+1] = bi[i] + nnz;
1374     if (nnz > rmax) rmax = nnz;
1375   }
1376   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1377 
1378   ierr = PetscMalloc1(bi[pn]+1,&bj);CHKERRQ(ierr);
1379   ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1380 
1381   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1382   if (afill_tmp > afill) afill = afill_tmp;
1383   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
1384   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1385   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1386 
1387   /* create symbolic parallel matrix Cmpi - why can this not be assembled in the Numeric part? */
1388   /*----------------------------------------------------------------------------------*/
1389   ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);
1390 
1391   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1392   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1393   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
1394   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1395   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1396   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1397   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
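  /* vals was allocated with PetscCalloc1, so the MatSetValues calls below insert explicit
     zeros: they only instantiate the nonzero pattern computed symbolically above; the
     numeric routine fills in the actual values later. */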
1398   for (i=0; i<pn; i++) {
1399     row  = i + rstart;
1400     nnz  = bi[i+1] - bi[i];
1401     Jptr = bj + bi[i];
1402     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1403   }
1404   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1405   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1406   ierr = PetscFree(vals);CHKERRQ(ierr);
1407 
1408   merge->bi        = bi;
1409   merge->bj        = bj;
1410   merge->coi       = coi;
1411   merge->coj       = coj;
1412   merge->buf_ri    = buf_ri;
1413   merge->buf_rj    = buf_rj;
1414   merge->owners_co = owners_co;
1415   merge->destroy   = Cmpi->ops->destroy;
1416   merge->duplicate = Cmpi->ops->duplicate;
1417 
1418   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
1419   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1420   Cmpi->ops->duplicate               = MatDuplicate_MPIAIJ_MatPtAP;
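  /* The original destroy/duplicate function pointers were stashed in merge above,
     presumably so that MatDestroy_MPIAIJ_PtAP can release the merge/ptap data and then
     fall through to the base MPIAIJ routines. */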
1421 
1422   /* attach the supporting struct to Cmpi for reuse */
1423   c           = (Mat_MPIAIJ*)Cmpi->data;
1424   c->ptap     = ptap;
1425   ptap->api   = NULL;
1426   ptap->apj   = NULL;
1427   ptap->merge = merge;
1428 
1429   *C = Cmpi;
1430 #if defined(PETSC_USE_INFO)
1431   if (bi[pn] != 0) {
1432     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
1433     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
1434   } else {
1435     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1436   }
1437 #endif
1438   PetscFunctionReturn(0);
1439 }
1440 
1441 #undef __FUNCT__
1442 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ"
1443 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
1444 {
1445   PetscErrorCode      ierr;
1446   Mat_Merge_SeqsToMPI *merge;
1447   Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1448   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1449   Mat_PtAPMPI         *ptap;
1450   PetscInt            *adj;
1451   PetscInt            i,j,k,anz,pnz,row,*cj,nexta;
1452   MatScalar           *ada,*ca,valtmp;
1453   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1454   MPI_Comm            comm;
1455   PetscMPIInt         size,rank,taga,*len_s;
1456   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1457   PetscInt            **buf_ri,**buf_rj;
1458   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
1459   MPI_Request         *s_waits,*r_waits;
1460   MPI_Status          *status;
1461   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
1462   PetscInt            *ai,*aj,*coi,*coj;
1463   PetscInt            *poJ,*pdJ;
1464   Mat                 A_loc;
1465   Mat_SeqAIJ          *a_loc;
1466 
1467   PetscFunctionBegin;
1468   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1469   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1470   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1471 
1472   ptap  = c->ptap;
1473   merge = ptap->merge;
1474 
1475   /* 2) compute numeric C_seq = P_loc^T*A_loc */
1476   /*------------------------------------------*/
1477   /* get data from symbolic products */
1478   coi    = merge->coi; coj = merge->coj;
1479   ierr   = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);
1480   bi     = merge->bi; bj = merge->bj;
1481   owners = merge->rowmap->range;
1482   ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);
1483 
1484   /* get A_loc by taking all local rows of A */
1485   A_loc = ptap->A_loc;
1486   ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
1487   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1488   ai    = a_loc->i;
1489   aj    = a_loc->j;
1490 
1491   for (i=0; i<am; i++) {
1492     anz = ai[i+1] - ai[i];
1493     adj = aj + ai[i];
1494     ada = a_loc->a + ai[i];
1495 
1496     /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1497     /*-------------------------------------------------------------*/
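    /* Sparse axpy: for each nonzero P(i,row) with value valtmp, add valtmp*A_loc(i,:) into
       row 'row' of Co (off-diagonal part) or Cd (diagonal part).  The symbolic phase built
       each such row (cj) as the union of the A_loc rows involved, so adj is a sorted
       subsequence of cj; the two-index scan advances k over cj and nexta over adj and
       accumulates only where the columns match, e.g. cj = {1,3,5,7}, adj = {3,7} touches
       ca[1] and ca[3]. */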
1498     /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1499     pnz = po->i[i+1] - po->i[i];
1500     poJ = po->j + po->i[i];
1501     pA  = po->a + po->i[i];
1502     for (j=0; j<pnz; j++) {
1503       row = poJ[j];
1504       cj  = coj + coi[row];
1505       ca  = coa + coi[row];
1506       /* perform sparse axpy */
1507       nexta  = 0;
1508       valtmp = pA[j];
1509       for (k=0; nexta<anz; k++) {
1510         if (cj[k] == adj[nexta]) {
1511           ca[k] += valtmp*ada[nexta];
1512           nexta++;
1513         }
1514       }
1515       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1516     }
1517 
1518     /* put the value into Cd (diagonal part) */
1519     pnz = pd->i[i+1] - pd->i[i];
1520     pdJ = pd->j + pd->i[i];
1521     pA  = pd->a + pd->i[i];
1522     for (j=0; j<pnz; j++) {
1523       row = pdJ[j];
1524       cj  = bj + bi[row];
1525       ca  = ba + bi[row];
1526       /* perform sparse axpy */
1527       nexta  = 0;
1528       valtmp = pA[j];
1529       for (k=0; nexta<anz; k++) {
1530         if (cj[k] == adj[nexta]) {
1531           ca[k] += valtmp*ada[nexta];
1532           nexta++;
1533         }
1534       }
1535       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1536     }
1537   }
1538 
1539   /* 3) send and recv matrix values coa */
1540   /*------------------------------------*/
1541   buf_ri = merge->buf_ri;
1542   buf_rj = merge->buf_rj;
1543   len_s  = merge->len_s;
1544   ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
1545   ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1546 
1547   ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
1548   for (proc=0,k=0; proc<size; proc++) {
1549     if (!len_s[proc]) continue;
1550     i    = merge->owners_co[proc];
1551     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1552     k++;
1553   }
1554   if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
1555   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
1556 
1557   ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
1558   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1559   ierr = PetscFree(coa);CHKERRQ(ierr);
1560 
1561   /* 4) insert local Cseq and received values into Cmpi */
1562   /*----------------------------------------------------*/
1563   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1564   for (k=0; k<merge->nrecv; k++) {
1565     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1566     nrows       = *(buf_ri_k[k]);
1567     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th recved i-structure */
1568     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
1569   }
1570 
1571   for (i=0; i<cm; i++) {
1572     row  = owners[rank] + i; /* global row index of C_seq */
1573     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1574     ba_i = ba + bi[i];
1575     bnz  = bi[i+1] - bi[i];
1576     /* add received vals into ba */
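    /* abuf_r[k]/buf_rj[k] hold the values/column indices of Co rows computed on other
       ranks for rows this rank owns; the symbolic phase merged those columns into bj, so
       bj_i is a superset of cj and the same two-index scan as above accumulates the
       received entries into ba_i. */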
1577     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1578       /* i-th row */
1579       if (i == *nextrow[k]) {
1580         cnz    = *(nextci[k]+1) - *nextci[k];
1581         cj     = buf_rj[k] + *(nextci[k]);
1582         ca     = abuf_r[k] + *(nextci[k]);
1583         nextcj = 0;
1584         for (j=0; nextcj<cnz; j++) {
1585           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1586             ba_i[j] += ca[nextcj++];
1587           }
1588         }
1589         nextrow[k]++; nextci[k]++;
1590         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1591       }
1592     }
1593     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
1594   }
1595   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1596   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1597 
1598   ierr = PetscFree(ba);CHKERRQ(ierr);
1599   ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
1600   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1601   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1602   PetscFunctionReturn(0);
1603 }
1604 
1605 PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1606 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ();
1607    it differs from MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable in using PetscLLCondensedCreate_Scalable() */
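/* The _Scalable condensed-linked-list routines used here size their workspace by Armax
   (the number of distinct column indices encountered, tracked with a PetscTable) instead
   of keeping an aN-wide bit table (lnkbt) as the nonscalable variant does, so the symbolic
   memory footprint scales with the local nonzero structure rather than with the global
   column count. */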
1608 #undef __FUNCT__
1609 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ"
1610 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
1611 {
1612   PetscErrorCode      ierr;
1613   Mat                 Cmpi,A_loc,POt,PDt;
1614   Mat_PtAPMPI         *ptap;
1615   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1616   Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
1617   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1618   PetscInt            nnz;
1619   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1620   PetscInt            am  =A->rmap->n,pn=P->cmap->n;
1621   MPI_Comm            comm;
1622   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1623   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1624   PetscInt            len,proc,*dnz,*onz,*owners;
1625   PetscInt            nzi,*bi,*bj;
1626   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1627   MPI_Request         *swaits,*rwaits;
1628   MPI_Status          *sstatus,rstatus;
1629   Mat_Merge_SeqsToMPI *merge;
1630   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1631   PetscReal           afill  =1.0,afill_tmp;
1632   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Armax;
1633   PetscScalar         *vals;
1634   Mat_SeqAIJ          *a_loc,*pdt,*pot;
1635   PetscTable          ta;
1636 
1637   PetscFunctionBegin;
1638   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1639   /* check if matrix local sizes are compatible */
1640   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1641 
1642   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1643   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1644 
1645   /* create struct Mat_PtAPMPI and attach it to C later */
1646   ierr = PetscNew(&ptap);CHKERRQ(ierr);
1647 
1648   /* get A_loc by taking all local rows of A */
1649   ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
1650 
1651   ptap->A_loc = A_loc;
1652   a_loc       = (Mat_SeqAIJ*)(A_loc)->data;
1653   ai          = a_loc->i;
1654   aj          = a_loc->j;
1655 
1656   /* determine symbolic Co=(p->B)^T*A - send to others */
1657   /*----------------------------------------------------*/
1658   ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
1659   pdt  = (Mat_SeqAIJ*)PDt->data;
1660   pdti = pdt->i; pdtj = pdt->j;
1661 
1662   ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
1663   pot  = (Mat_SeqAIJ*)POt->data;
1664   poti = pot->i; potj = pot->j;
1665 
1666   /* then, compute symbolic Co = (p->B)^T*A */
1667   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1668                          >= (num of nonzero rows of C_seq) - pn */
1669   ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
1670   coi[0] = 0;
1671 
1672   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1673   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],ai[am]));
1674   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1675   current_space = free_space;
1676 
1677   /* create and initialize a linked list */
1678   ierr = PetscTableCreate(2*a_loc->rmax,aN,&ta);CHKERRQ(ierr);
1679   MatRowMergeMax_SeqAIJ(a_loc,am,ta);
1680   ierr = PetscTableGetCount(ta,&Armax);CHKERRQ(ierr);
1681   ierr = PetscLLCondensedCreate_Scalable(Armax,&lnk);CHKERRQ(ierr);
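  /* ta collects the distinct column indices of A_loc (via MatRowMergeMax_SeqAIJ), so Armax
     should bound the number of distinct columns in any row of Co = Po^T*A_loc and is used
     to size the scalable condensed linked list. */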
1682 
1683   for (i=0; i<pon; i++) {
1684     pnz = poti[i+1] - poti[i];
1685     ptJ = potj + poti[i];
1686     for (j=0; j<pnz; j++) {
1687       row  = ptJ[j]; /* row of A_loc == col of Pot */
1688       anz  = ai[row+1] - ai[row];
1689       Jptr = aj + ai[row];
1690       /* add non-zero cols of A_loc into the sorted linked list lnk */
1691       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1692     }
1693     nnz = lnk[0];
1694 
1695     /* If free space is not available, double the total space in the list */
1696     if (current_space->local_remaining<nnz) {
1697       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1698       nspacedouble++;
1699     }
1700 
1701     /* Copy data into free space, and zero out dense rows */
1702     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1703 
1704     current_space->array           += nnz;
1705     current_space->local_used      += nnz;
1706     current_space->local_remaining -= nnz;
1707 
1708     coi[i+1] = coi[i] + nnz;
1709   }
1710 
1711   ierr = PetscMalloc1(coi[pon]+1,&coj);CHKERRQ(ierr);
1712   ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
1713   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); /* must destroy to get a new one for C */
1714 
1715   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1716   if (afill_tmp > afill) afill = afill_tmp;
1717 
1718   /* send j-array (coj) of Co to other processors */
1719   /*----------------------------------------------*/
1720   /* determine row ownership */
1721   ierr = PetscNew(&merge);CHKERRQ(ierr);
1722   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1723 
1724   merge->rowmap->n  = pn;
1725   merge->rowmap->bs = 1;
1726 
1727   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1728   owners = merge->rowmap->range;
1729 
1730   /* determine the number of messages to send, their lengths */
1731   ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
1732   ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);
1733 
1734   len_s        = merge->len_s;
1735   merge->nsend = 0;
1736 
1737   ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
1738   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1739 
1740   proc = 0;
1741   for (i=0; i<pon; i++) {
1742     while (prmap[i] >= owners[proc+1]) proc++;
1743     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1744     len_s[proc] += coi[i+1] - coi[i];
1745   }
1746 
1747   len          = 0; /* total length of buf_s[] (sum of len_si[]) */
1748   owners_co[0] = 0;
1749   for (proc=0; proc<size; proc++) {
1750     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1751     if (len_si[proc]) {
1752       merge->nsend++;
1753       len_si[proc] = 2*(len_si[proc] + 1);
1754       len         += len_si[proc];
1755     }
1756   }
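  /* Each i-structure message carries one count, nrows row indices and nrows+1 offsets,
     i.e. 2*(nrows+1) integers, hence the doubling of len_si above; len accumulates the
     total size of the concatenated send buffer buf_s. */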
1757 
1758   /* determine the number and length of messages to receive for coi and coj  */
1759   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1760   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1761 
1762   /* post the Irecv and Isend of coj */
1763   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1764   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1765   ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
1766   for (proc=0, k=0; proc<size; proc++) {
1767     if (!len_s[proc]) continue;
1768     i    = owners_co[proc];
1769     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1770     k++;
1771   }
1772 
1773   /* receives and sends of coj are complete */
1774   ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
1775   for (i=0; i<merge->nrecv; i++) {
1776     PetscMPIInt icompleted;
1777     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1778   }
1779   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1780   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1781 
1782   /* add received column indices into table to update Armax */
1783   for (k=0; k<merge->nrecv; k++) {/* k-th received message */
1784     Jptr = buf_rj[k];
1785     for (j=0; j<merge->len_r[k]; j++) {
1786       ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
1787     }
1788   }
1789   ierr = PetscTableGetCount(ta,&Armax);CHKERRQ(ierr);
1790 
1791   /* send and recv coi */
1792   /*-------------------*/
1793   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1794   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1795   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
1796   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1797   for (proc=0,k=0; proc<size; proc++) {
1798     if (!len_s[proc]) continue;
1799     /* form outgoing message for i-structure:
1800          buf_si[0]:                 nrows to be sent
1801                [1:nrows]:           row index (local to the receiving process)
1802                [nrows+1:2*nrows+1]: i-structure index
1803     */
1804     /*-------------------------------------------*/
1805     nrows       = len_si[proc]/2 - 1;
1806     buf_si_i    = buf_si + nrows+1;
1807     buf_si[0]   = nrows;
1808     buf_si_i[0] = 0;
1809     nrows       = 0;
1810     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1811       nzi               = coi[i+1] - coi[i];
1812       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
1813       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1814       nrows++;
1815     }
1816     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1817     k++;
1818     buf_si += len_si[proc];
1819   }
1820   i = merge->nrecv;
1821   while (i--) {
1822     PetscMPIInt icompleted;
1823     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1824   }
1825   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1826   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1827   ierr = PetscFree(len_si);CHKERRQ(ierr);
1828   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1829   ierr = PetscFree(swaits);CHKERRQ(ierr);
1830   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1831   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1832 
1833   /* compute the local portion of C (mpi mat) */
1834   /*------------------------------------------*/
1835   /* allocate bi array and free space for accumulating nonzero column info */
1836   ierr  = PetscMalloc1(pn+1,&bi);CHKERRQ(ierr);
1837   bi[0] = 0;
1838 
1839   /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1840   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pdti[pn],PetscIntSumTruncate(poti[pon],ai[am])));
1841   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1842   current_space = free_space;
1843 
1844   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1845   for (k=0; k<merge->nrecv; k++) {
1846     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1847     nrows       = *buf_ri_k[k];
1848     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
1849     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure  */
1850   }
1851 
1852   ierr = PetscLLCondensedCreate_Scalable(Armax,&lnk);CHKERRQ(ierr);
1853   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1854   rmax = 0;
1855   for (i=0; i<pn; i++) {
1856     /* add pdt[i,:]*A into lnk */
1857     pnz = pdti[i+1] - pdti[i];
1858     ptJ = pdtj + pdti[i];
1859     for (j=0; j<pnz; j++) {
1860       row  = ptJ[j];  /* row of A == col of Pt */
1861       anz  = ai[row+1] - ai[row];
1862       Jptr = aj + ai[row];
1863       /* add non-zero cols of A into the sorted linked list lnk */
1864       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1865     }
1866 
1867     /* add received col data into lnk */
1868     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1869       if (i == *nextrow[k]) { /* i-th row */
1870         nzi  = *(nextci[k]+1) - *nextci[k];
1871         Jptr = buf_rj[k] + *nextci[k];
1872         ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
1873         nextrow[k]++; nextci[k]++;
1874       }
1875     }
1876     nnz = lnk[0];
1877 
1878     /* if free space is not available, make more free space */
1879     if (current_space->local_remaining<nnz) {
1880       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1881       nspacedouble++;
1882     }
1883     /* copy data into free space, then initialize lnk */
1884     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1885     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1886 
1887     current_space->array           += nnz;
1888     current_space->local_used      += nnz;
1889     current_space->local_remaining -= nnz;
1890 
1891     bi[i+1] = bi[i] + nnz;
1892     if (nnz > rmax) rmax = nnz;
1893   }
1894   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1895 
1896   ierr      = PetscMalloc1(bi[pn]+1,&bj);CHKERRQ(ierr);
1897   ierr      = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1898   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1899   if (afill_tmp > afill) afill = afill_tmp;
1900   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
1901   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
1902 
1903   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1904   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1905 
1906   /* create symbolic parallel matrix Cmpi - why can this not be assembled in the Numeric part? */
1907   /*----------------------------------------------------------------------------------*/
1908   ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);
1909 
1910   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1911   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1912   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
1913   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1914   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1915   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1916   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1917   for (i=0; i<pn; i++) {
1918     row  = i + rstart;
1919     nnz  = bi[i+1] - bi[i];
1920     Jptr = bj + bi[i];
1921     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1922   }
1923   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1924   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1925   ierr = PetscFree(vals);CHKERRQ(ierr);
1926 
1927   merge->bi        = bi;
1928   merge->bj        = bj;
1929   merge->coi       = coi;
1930   merge->coj       = coj;
1931   merge->buf_ri    = buf_ri;
1932   merge->buf_rj    = buf_rj;
1933   merge->owners_co = owners_co;
1934   merge->destroy   = Cmpi->ops->destroy;
1935   merge->duplicate = Cmpi->ops->duplicate;
1936 
1937   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
1938   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1939   Cmpi->ops->duplicate               = MatDuplicate_MPIAIJ_MatPtAP;
1940 
1941   /* attach the supporting struct to Cmpi for reuse */
1942   c = (Mat_MPIAIJ*)Cmpi->data;
1943 
1944   c->ptap     = ptap;
1945   ptap->api   = NULL;
1946   ptap->apj   = NULL;
1947   ptap->merge = merge;
1948   ptap->apa   = NULL;
1949 
1950   *C = Cmpi;
1951 #if defined(PETSC_USE_INFO)
1952   if (bi[pn] != 0) {
1953     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
1954     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
1955   } else {
1956     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1957   }
1958 #endif
1959   PetscFunctionReturn(0);
1960 }
1961