xref: /petsc/src/mat/impls/aij/mpi/mpimatmatmult.c (revision 2ec85ea456d4da44b4dd09fd939e615b8c7fb52b)

/*
  Defines matrix-matrix product routines for pairs of MPIAIJ matrices
          C = A * B
*/
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/utils/freespace.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <../src/mat/impls/dense/mpi/mpidense.h>
#include <petsc/private/vecimpl.h>

#if defined(PETSC_HAVE_HYPRE)
PETSC_INTERN PetscErrorCode MatMatMultSymbolic_AIJ_AIJ_wHYPRE(Mat,Mat,PetscReal,Mat*);
#endif

PETSC_INTERN PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C)
{
  PetscErrorCode ierr;
#if defined(PETSC_HAVE_HYPRE)
  const char     *algTypes[3] = {"scalable","nonscalable","hypre"};
  PetscInt       nalg = 3;
#else
  const char     *algTypes[2] = {"scalable","nonscalable"};
  PetscInt       nalg = 2;
#endif
  PetscInt       alg = 1; /* set nonscalable algorithm as default */
  MPI_Comm       comm;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
    if (A->cmap->rstart != B->rmap->rstart || A->cmap->rend != B->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D,%D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,B->rmap->rstart,B->rmap->rend);

    ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
    ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[1],&alg,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsEnd();CHKERRQ(ierr);

    ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
    switch (alg) {
    case 1:
      ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A,B,fill,C);CHKERRQ(ierr);
      break;
#if defined(PETSC_HAVE_HYPRE)
    case 2:
      ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr);
      break;
#endif
    default:
      ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);CHKERRQ(ierr);
      break;
    }
    ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
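
/*
   Usage sketch (illustrative only, not part of this file): the routine above
   is normally reached through the public MatMatMult() interface, with A and B
   assumed to be assembled MATMPIAIJ matrices.

     Mat C;
     ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
     .... change numerical values of A and/or B, keeping their nonzero patterns ....
     ierr = MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
     ierr = MatDestroy(&C);CHKERRQ(ierr);

   The algorithm can be selected at runtime with
   -matmatmult_via <scalable,nonscalable,hypre>, matching the
   PetscOptionsEList() call above.
*/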

PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI    *ptap = a->ptap;

  PetscFunctionBegin;
  ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
  ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);
  ierr = PetscFree(ptap->api);CHKERRQ(ierr);
  ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
  ierr = PetscFree(ptap->apa);CHKERRQ(ierr);
  ierr = ptap->destroy(A);CHKERRQ(ierr);
  ierr = PetscFree(ptap);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI    *ptap = a->ptap;

  PetscFunctionBegin;
  ierr = (*ptap->duplicate)(A,op,M);CHKERRQ(ierr);

  (*M)->ops->destroy   = ptap->destroy;   /* = MatDestroy_MPIAIJ, *M does not duplicate A's special structure! */
  (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
  PetscFunctionReturn(0);
}

PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a  =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ     *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
  PetscScalar    *cda=cd->a,*coa=co->a;
  Mat_SeqAIJ     *p_loc,*p_oth;
  PetscScalar    *apa,*ca;
  PetscInt       cm   =C->rmap->n;
  Mat_PtAPMPI    *ptap=c->ptap;
  PetscInt       *api,*apj,*apJ,i,k;
  PetscInt       cstart=C->cmap->rstart;
  PetscInt       cdnz,conz,k0,k1;
  MPI_Comm       comm;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  /* update numerical values of P_oth and P_loc */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
  /*----------------------------------------------------------*/
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = NULL;
  if (size > 1) {
    p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  }

  /* get apa for storing dense row A[i,:]*P */
  apa = ptap->apa;

  api = ptap->api;
  apj = ptap->apj;
  for (i=0; i<cm; i++) {
    /* compute apa = A[i,:]*P */
    AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);

    /* set values in C */
    apJ  = apj + api[i];
    cdnz = cd->i[i+1] - cd->i[i];
    conz = co->i[i+1] - co->i[i];

    /* 1st off-diagonal part of C */
    ca = coa + co->i[i];
    k  = 0;
    for (k0=0; k0<conz; k0++) {
      if (apJ[k] >= cstart) break;
      ca[k0]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }

    /* diagonal part of C */
    ca = cda + cd->i[i];
    for (k1=0; k1<cdnz; k1++) {
      ca[k1]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }

    /* 2nd off-diagonal part of C */
    ca = coa + co->i[i];
    for (; k0<conz; k0++) {
      ca[k0]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
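
/*
   AProw_nonscalable() (a macro defined in mpiaij.h) accumulates row i of
   A_diag*P_loc + A_off*P_oth into the dense work array apa of length
   P->cmap->N. A minimal sketch of the idea as a hypothetical helper; the
   actual macro may differ in detail:

     static void APRow_Dense(PetscInt i,Mat_SeqAIJ *ad,Mat_SeqAIJ *ao,
                             Mat_SeqAIJ *p_loc,Mat_SeqAIJ *p_oth,PetscScalar *apa)
     {
       PetscInt j,k,row;
       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
         row = ad->j[j];
         for (k=p_loc->i[row]; k<p_loc->i[row+1]; k++) apa[p_loc->j[k]] += ad->a[j]*p_loc->a[k];
       }
       if (p_oth) {
         for (j=ao->i[i]; j<ao->i[i+1]; j++) {
           row = ao->j[j];
           for (k=p_oth->i[row]; k<p_oth->i[row+1]; k++) apa[p_oth->j[k]] += ao->a[j]*p_oth->a[k];
         }
       }
     }

   Both P_loc and P_oth carry global column indices, so apa can be indexed
   directly by their j arrays; the scatter loops above then drain apa back to
   zero, keeping each row's cost proportional to its nonzeros after the
   initial calloc.
*/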

PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode     ierr;
  MPI_Comm           comm;
  PetscMPIInt        size;
  Mat                Cmpi;
  Mat_PtAPMPI        *ptap;
  PetscFreeSpaceList free_space=NULL,current_space=NULL;
  Mat_MPIAIJ         *a        =(Mat_MPIAIJ*)A->data,*c;
  Mat_SeqAIJ         *ad       =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
  PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
  PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
  PetscInt           *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
  PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n,Crmax;
  PetscBT            lnkbt;
  PetscScalar        *apa;
  PetscReal          afill;
  PetscTable         ta;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI; attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j;
  if (size > 1) {
    p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
    pi_oth = p_oth->i; pj_oth = p_oth->j;
  } else {
    p_oth = NULL;
    pi_oth = NULL; pj_oth = NULL;
  }

  /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
  /*-------------------------------------------------------------------*/
  ierr      = PetscMalloc1(am+2,&api);CHKERRQ(ierr);
  ptap->api = api;
  api[0]    = 0;

  /* use a PetscTable to compute Crmax, an upper bound for the row length of C -- TODO: replace it with PetscBTCreate()! */
  ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr);
  MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
  MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
  ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);

  ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
  ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    /* diagonal portion of A */
    nzi = adi[i+1] - adi[i];
    for (j=0; j<nzi; j++) {
      row  = *adj++;
      pnz  = pi_loc[row+1] - pi_loc[row];
      Jptr = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    for (j=0; j<nzi; j++) {
      row  = *aoj++;
      pnz  = pi_oth[row+1] - pi_oth[row];
      Jptr = pj_oth + pi_oth[row];
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }

    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }

  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc1(api[am]+1,&ptap->apj);CHKERRQ(ierr);
  apj  = ptap->apj;
  ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* malloc apa to store dense row A[i,:]*P */
  ierr = PetscCalloc1(pN,&apa);CHKERRQ(ierr);

  ptap->apa = apa;

  /* create and assemble symbolic parallel matrix Cmpi */
  /*----------------------------------------------------*/
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);

  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    row  = i + rstart;
    apnz = api[i+1] - api[i];
    ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
    apj += apnz;
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->destroy        = Cmpi->ops->destroy;
  ptap->duplicate      = Cmpi->ops->duplicate;
  Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
  Cmpi->ops->destroy   = MatDestroy_MPIAIJ_MatMatMult;
  Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;

  /* attach the supporting struct to Cmpi for reuse */
  c       = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap = ptap;

  *C = Cmpi;

  /* set MatInfo */
  afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
  if (afill < 1.0) afill = 1.0;
  Cmpi->info.mallocs           = nspacedouble;
  Cmpi->info.fill_ratio_given  = fill;
  Cmpi->info.fill_ratio_needed = afill;

#if defined(PETSC_USE_INFO)
  if (api[am]) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}
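
/*
   The symbolic loop above computes the sparsity pattern of row i of AP as the
   union of the (sorted) column lists of all rows of P referenced by row i of
   A; PetscLLCondensedAddSorted() maintains that union in a condensed linked
   list. A minimal stand-alone sketch of one union step over two sorted index
   arrays (illustrative only; the linked-list version avoids re-copying the
   accumulated pattern for every merged row):

     static PetscInt MergeSortedUnion(PetscInt na,const PetscInt *a,PetscInt nb,const PetscInt *b,PetscInt *out)
     {
       PetscInt i = 0,j = 0,n = 0;
       while (i < na && j < nb) {
         if (a[i] < b[j])      out[n++] = a[i++];
         else if (a[i] > b[j]) out[n++] = b[j++];
         else                { out[n++] = a[i++]; j++; }
       }
       while (i < na) out[n++] = a[i++];
       while (j < nb) out[n++] = b[j++];
       return n;
     }
*/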

PETSC_INTERN PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
    ierr = MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  ierr = MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

typedef struct {
  Mat         workB;
  PetscScalar *rvalues,*svalues;
  MPI_Request *rwaits,*swaits;
} MPIAIJ_MPIDense;

PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
{
  MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatDestroy(&contents->workB);CHKERRQ(ierr);
  ierr = PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);CHKERRQ(ierr);
  ierr = PetscFree(contents);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    This is a "dummy" routine that handles the case where the matrix C was created as a dense matrix
  directly by the user and passed to MatMatMult() with the MAT_REUSE_MATRIX option.

  It is the same as MatMatMultSymbolic_MPIAIJ_MPIDense() except that it does not create C.
*/
PetscErrorCode MatMatMultNumeric_MPIDense(Mat A,Mat B,Mat C)
{
  PetscErrorCode         ierr;
  PetscBool              flg;
  Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
  PetscInt               nz   = aij->B->cmap->n;
  PetscContainer         container;
  MPIAIJ_MPIDense        *contents;
  VecScatter             ctx   = aij->Mvctx;
  VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
  VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)B,MATMPIDENSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Second matrix must be mpidense");

  /* Handle the case where the user provided the final C matrix rather than calling MatMatMult() with MAT_INITIAL_MATRIX */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATMPIAIJ,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"First matrix must be MPIAIJ");

  C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;

  ierr = PetscNew(&contents);CHKERRQ(ierr);
  /* Create work matrix used to store off-processor rows of B needed for local product */
  ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
  /* Create work arrays needed */
  ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
                      B->cmap->N*to->starts[to->n],&contents->svalues,
                      from->n,&contents->rwaits,
                      to->n,&contents->swaits);CHKERRQ(ierr);

  ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr);
  ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
  ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
  ierr = PetscObjectCompose((PetscObject)C,"workB",(PetscObject)container);CHKERRQ(ierr);
  ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);

  ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
{
  PetscErrorCode         ierr;
  Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
  PetscInt               nz   = aij->B->cmap->n;
  PetscContainer         container;
  MPIAIJ_MPIDense        *contents;
  VecScatter             ctx   = aij->Mvctx;
  VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
  VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
  PetscInt               m     = A->rmap->n,n=B->cmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)B),C);CHKERRQ(ierr);
  ierr = MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);CHKERRQ(ierr);
  ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
  ierr = MatSetType(*C,MATMPIDENSE);CHKERRQ(ierr);
  ierr = MatMPIDenseSetPreallocation(*C,NULL);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;

  ierr = PetscNew(&contents);CHKERRQ(ierr);
  /* Create work matrix used to store off-processor rows of B needed for local product */
  ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
  /* Create work arrays needed */
  ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
                      B->cmap->N*to->starts[to->n],&contents->svalues,
                      from->n,&contents->rwaits,
                      to->n,&contents->swaits);CHKERRQ(ierr);

  ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr);
  ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
  ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
  ierr = PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);CHKERRQ(ierr);
  ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
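
/*
   The PetscContainer idiom above ties the lifetime of the communication
   workspace to C: the container holds the MPIAIJ_MPIDense pointer, its user
   destroy routine frees the workspace when C is destroyed, and the local
   container reference is dropped immediately after composing. The retrieval
   side, as used by MatMPIDenseScatter() below:

     PetscContainer  container;
     MPIAIJ_MPIDense *contents;
     ierr = PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);CHKERRQ(ierr);
     ierr = PetscContainerGetPointer(container,(void**)&contents);CHKERRQ(ierr);
*/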

/*
    Performs an efficient scatter on the rows of B needed by this process; this is
    a modification of the VecScatterBegin_() routines.
*/
PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
{
  Mat_MPIAIJ             *aij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode         ierr;
  PetscScalar            *b,*w,*svalues,*rvalues;
  VecScatter             ctx   = aij->Mvctx;
  VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
  VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
  PetscInt               i,j,k;
  PetscInt               *sindices,*sstarts,*rindices,*rstarts;
  PetscMPIInt            *sprocs,*rprocs,nrecvs;
  MPI_Request            *swaits,*rwaits;
  MPI_Comm               comm;
  PetscMPIInt            tag  = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
  MPI_Status             status;
  MPIAIJ_MPIDense        *contents;
  PetscContainer         container;
  Mat                    workB;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);CHKERRQ(ierr);
  if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
  ierr = PetscContainerGetPointer(container,(void**)&contents);CHKERRQ(ierr);

  workB = *outworkB = contents->workB;
  if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",workB->rmap->n,nrows);
  sindices = to->indices;
  sstarts  = to->starts;
  sprocs   = to->procs;
  swaits   = contents->swaits;
  svalues  = contents->svalues;

  rindices = from->indices;
  rstarts  = from->starts;
  rprocs   = from->procs;
  rwaits   = contents->rwaits;
  rvalues  = contents->rvalues;

  ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr);
  ierr = MatDenseGetArray(workB,&w);CHKERRQ(ierr);

  for (i=0; i<from->n; i++) {
    ierr = MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);CHKERRQ(ierr);
  }

  for (i=0; i<to->n; i++) {
    /* pack a message at a time */
    for (j=0; j<sstarts[i+1]-sstarts[i]; j++) {
      for (k=0; k<ncols; k++) {
        svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
      }
    }
    ierr = MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);CHKERRQ(ierr);
  }

  nrecvs = from->n;
  while (nrecvs) {
    ierr = MPI_Waitany(from->n,rwaits,&imdex,&status);CHKERRQ(ierr);
    nrecvs--;
    /* unpack a message at a time */
    for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++) {
      for (k=0; k<ncols; k++) {
        w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
      }
    }
  }
  if (to->n) {ierr = MPI_Waitall(to->n,swaits,to->sstatus);CHKERRQ(ierr);}

  ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr);
  ierr = MatDenseRestoreArray(workB,&w);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
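
/*
   Data layout note: B is stored column-major with leading dimension
   B->rmap->n (= nrowsB), so b[sindices[sstarts[i]+j] + nrowsB*k] above is
   entry (sindices[sstarts[i]+j], k) of B. Each message therefore packs the
   needed rows contiguously, one full row (ncols entries) at a time, and the
   unpack loop writes each received row into workB, which is likewise
   column-major with leading dimension aij->B->cmap->n (= nrows).
*/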
extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);

PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij    = (Mat_MPIAIJ*)A->data;
  Mat_MPIDense   *bdense = (Mat_MPIDense*)B->data;
  Mat_MPIDense   *cdense = (Mat_MPIDense*)C->data;
  Mat            workB;

  PetscFunctionBegin;
  /* diagonal block of A times all local rows of B */
  ierr = MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);CHKERRQ(ierr);

  /* get off-processor parts of B needed to complete the product */
  ierr = MatMPIDenseScatter(A,B,C,&workB);CHKERRQ(ierr);

  /* off-diagonal block of A times nonlocal rows of B */
  ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a   = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *ad  = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ     *cd  = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
  PetscInt       *adi = ad->i,*adj,*aoi=ao->i,*aoj;
  PetscScalar    *ada,*aoa,*cda=cd->a,*coa=co->a;
  Mat_SeqAIJ     *p_loc,*p_oth;
  PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
  PetscScalar    *pa_loc,*pa_oth,*pa,valtmp,*ca;
  PetscInt       cm          = C->rmap->n,anz,pnz;
  Mat_PtAPMPI    *ptap       = c->ptap;
  PetscScalar    *apa_sparse = ptap->apa;
  PetscInt       *api,*apj,*apJ,i,j,k,row;
  PetscInt       cstart = C->cmap->rstart;
  PetscInt       cdnz,conz,k0,k1,nextp;
  MPI_Comm       comm;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  /* update numerical values of P_oth and P_loc */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
  /*----------------------------------------------------------*/
  /* get data from symbolic products */
  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j; pa_loc = p_loc->a;
  if (size > 1) {
    p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
    pi_oth = p_oth->i; pj_oth = p_oth->j; pa_oth = p_oth->a;
  } else {
    p_oth = NULL; pi_oth = NULL; pj_oth = NULL; pa_oth = NULL;
  }

  api = ptap->api;
  apj = ptap->apj;
  for (i=0; i<cm; i++) {
    apJ = apj + api[i];

    /* diagonal portion of A */
    anz = adi[i+1] - adi[i];
    adj = ad->j + adi[i];
    ada = ad->a + adi[i];
    for (j=0; j<anz; j++) {
      row = adj[j];
      pnz = pi_loc[row+1] - pi_loc[row];
      pj  = pj_loc + pi_loc[row];
      pa  = pa_loc + pi_loc[row];
      /* perform sparse axpy */
      valtmp = ada[j];
      nextp  = 0;
      for (k=0; nextp<pnz; k++) {
        if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
          apa_sparse[k] += valtmp*pa[nextp++];
        }
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* off-diagonal portion of A */
    anz = aoi[i+1] - aoi[i];
    aoj = ao->j + aoi[i];
    aoa = ao->a + aoi[i];
    for (j=0; j<anz; j++) {
      row = aoj[j];
      pnz = pi_oth[row+1] - pi_oth[row];
      pj  = pj_oth + pi_oth[row];
      pa  = pa_oth + pi_oth[row];
      /* perform sparse axpy */
      valtmp = aoa[j];
      nextp  = 0;
      for (k=0; nextp<pnz; k++) {
        if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
          apa_sparse[k] += valtmp*pa[nextp++];
        }
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* set values in C */
    cdnz = cd->i[i+1] - cd->i[i];
    conz = co->i[i+1] - co->i[i];

    /* 1st off-diagonal part of C */
    ca = coa + co->i[i];
    k  = 0;
    for (k0=0; k0<conz; k0++) {
      if (apJ[k] >= cstart) break;
      ca[k0]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }

    /* diagonal part of C */
    ca = cda + cd->i[i];
    for (k1=0; k1<cdnz; k1++) {
      ca[k1]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }

    /* 2nd off-diagonal part of C */
    ca = coa + co->i[i];
    for (; k0<conz; k0++) {
      ca[k0]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
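
/*
   The "sparse axpy" loops above exploit two facts established by the
   symbolic phase: apJ (the pattern of row i of AP) is sorted, and the column
   list pj of every contributing row of P is a sorted subset of apJ. A single
   forward scan with two cursors therefore suffices; a stand-alone sketch
   (illustrative only):

     static void SparseAXPY(PetscInt pnz,const PetscInt *pj,const PetscScalar *pa,
                            PetscScalar alpha,const PetscInt *apJ,PetscScalar *apa_sparse)
     {
       PetscInt k,nextp = 0;
       for (k=0; nextp<pnz; k++) {
         if (apJ[k] == pj[nextp]) apa_sparse[k] += alpha*pa[nextp++];
       }
     }
*/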

/* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(), except it uses a condensed linked list
   and a sparse work array of length apnz_max to avoid the O(pN) dense memory requirement */
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode     ierr;
  MPI_Comm           comm;
  PetscMPIInt        size;
  Mat                Cmpi;
  Mat_PtAPMPI        *ptap;
  PetscFreeSpaceList free_space = NULL,current_space=NULL;
  Mat_MPIAIJ         *a         = (Mat_MPIAIJ*)A->data,*c;
  Mat_SeqAIJ         *ad        = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
  PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
  PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
  PetscInt           i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max;
  PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
  PetscReal          afill;
  PetscScalar        *apa;
  PetscTable         ta;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI; attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j;
  if (size > 1) {
    p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
    pi_oth = p_oth->i; pj_oth = p_oth->j;
  } else {
    p_oth  = NULL;
    pi_oth = NULL; pj_oth = NULL;
  }

  /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
  /*-------------------------------------------------------------------*/
  ierr      = PetscMalloc1(am+2,&api);CHKERRQ(ierr);
  ptap->api = api;
  api[0]    = 0;

  /* create a PetscTable used to determine apnz_max */
  ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr);

  /* Calculate apnz_max */
  apnz_max = 0;
  for (i=0; i<am; i++) {
    ierr = PetscTableRemoveAll(ta);CHKERRQ(ierr);
    /* diagonal portion of A */
    nzi  = adi[i+1] - adi[i];
    Jptr = adj+adi[i];  /* cols of A_diag */
    MatMergeRows_SeqAIJ(p_loc,nzi,Jptr,ta);
    ierr = PetscTableGetCount(ta,&apnz);CHKERRQ(ierr);
    if (apnz_max < apnz) apnz_max = apnz;

    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    Jptr = aoj+aoi[i];  /* cols of A_off */
    MatMergeRows_SeqAIJ(p_oth,nzi,Jptr,ta);
    ierr = PetscTableGetCount(ta,&apnz);CHKERRQ(ierr);
    if (apnz_max < apnz) apnz_max = apnz;
  }
  ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);

  ierr = PetscLLCondensedCreate_Scalable(apnz_max,&lnk);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
  ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);CHKERRQ(ierr);
  current_space = free_space;
  ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    /* diagonal portion of A */
    nzi = adi[i+1] - adi[i];
    for (j=0; j<nzi; j++) {
      row  = *adj++;
      pnz  = pi_loc[row+1] - pi_loc[row];
      Jptr = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }
    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    for (j=0; j<nzi; j++) {
      row  = *aoj++;
      pnz  = pi_oth[row+1] - pi_oth[row];
      Jptr = pj_oth + pi_oth[row];
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }

    apnz     = *lnk;
    api[i+1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }

  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc1(api[am]+1,&ptap->apj);CHKERRQ(ierr);
  apj  = ptap->apj;
  ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);

  /* create and assemble symbolic parallel matrix Cmpi */
  /*----------------------------------------------------*/
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* malloc apa for assembling Cmpi */
  ierr = PetscCalloc1(apnz_max,&apa);CHKERRQ(ierr);

  ptap->apa = apa;
  for (i=0; i<am; i++) {
    row  = i + rstart;
    apnz = api[i+1] - api[i];
    ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
    apj += apnz;
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->destroy             = Cmpi->ops->destroy;
  ptap->duplicate           = Cmpi->ops->duplicate;
  Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ;
  Cmpi->ops->destroy        = MatDestroy_MPIAIJ_MatMatMult;
  Cmpi->ops->duplicate      = MatDuplicate_MPIAIJ_MatMatMult;

  /* attach the supporting struct to Cmpi for reuse */
  c       = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap = ptap;

  *C = Cmpi;

  /* set MatInfo */
  afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
  if (afill < 1.0) afill = 1.0;
  Cmpi->info.mallocs           = nspacedouble;
  Cmpi->info.fill_ratio_given  = fill;
  Cmpi->info.fill_ratio_needed = afill;

#if defined(PETSC_USE_INFO)
  if (api[am]) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}

/*-------------------------------------------------------------------------*/
PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  const char     *algTypes[3] = {"scalable","nonscalable","matmatmult"};
  PetscInt       alg=0; /* set scalable algorithm as default */

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
    ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,3,algTypes[0],&alg,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsEnd();CHKERRQ(ierr);

    ierr = PetscLogEventBegin(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
    switch (alg) {
    case 1:
      ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(P,A,fill,C);CHKERRQ(ierr);
      break;
    case 2:
    {
      Mat         Pt;
      Mat_PtAPMPI *ptap;
      Mat_MPIAIJ  *c;
      ierr = MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);CHKERRQ(ierr);
      ierr = MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr);
      c        = (Mat_MPIAIJ*)(*C)->data;
      ptap     = c->ptap;
      ptap->Pt = Pt;
      (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult;
      PetscFunctionReturn(0);
    }
      break;
    default:
      ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);CHKERRQ(ierr);
      break;
    }
    ierr = PetscLogEventEnd(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
  ierr = (*(*C)->ops->mattransposemultnumeric)(P,A,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
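
/*
   Usage sketch (illustrative only): C = P^T*A through the public interface,
   with P and A assumed to be assembled MATMPIAIJ matrices.

     Mat C;
     ierr = MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);

   The algorithm is selectable with
   -mattransposematmult_via <scalable,nonscalable,matmatmult>; the matmatmult
   variant forms Pt = P^T explicitly and reuses MatMatMult(), as shown above.
*/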

/* This routine only works when scall=MAT_REUSE_MATRIX! */
PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult(Mat P,Mat A,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *c=(Mat_MPIAIJ*)C->data;
  Mat_PtAPMPI    *ptap= c->ptap;
  Mat            Pt=ptap->Pt;

  PetscFunctionBegin;
  ierr = MatTranspose(P,MAT_REUSE_MATRIX,&Pt);CHKERRQ(ierr);
  ierr = MatMatMultNumeric(Pt,A,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Non-scalable version, uses dense axpy */
PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,Mat C)
{
  PetscErrorCode      ierr;
  Mat_Merge_SeqsToMPI *merge;
  Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
  Mat_PtAPMPI         *ptap;
  PetscInt            *adj,*aJ;
  PetscInt            i,j,k,anz,pnz,row,*cj;
  MatScalar           *ada,*aval,*ca,valtmp;
  PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
  MPI_Comm            comm;
  PetscMPIInt         size,rank,taga,*len_s;
  PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
  PetscInt            **buf_ri,**buf_rj;
  PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
  MPI_Request         *s_waits,*r_waits;
  MPI_Status          *status;
  MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
  PetscInt            *ai,*aj,*coi,*coj;
  PetscInt            *poJ,*pdJ;
  Mat                 A_loc;
  Mat_SeqAIJ          *a_loc;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ptap  = c->ptap;
  merge = ptap->merge;

  /* 2) compute numeric C_seq = P_loc^T*A_loc - dominating part */
  /*--------------------------------------------------------------*/
  /* get data from symbolic products */
  coi  = merge->coi; coj = merge->coj;
  ierr = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);

  bi     = merge->bi; bj = merge->bj;
  owners = merge->rowmap->range;
  ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  A_loc = ptap->A_loc;
  ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
  a_loc = (Mat_SeqAIJ*)(A_loc)->data;
  ai    = a_loc->i;
  aj    = a_loc->j;

  ierr = PetscCalloc1(A->cmap->N,&aval);CHKERRQ(ierr); /* non-scalable!!! */

  for (i=0; i<am; i++) {
    /* 2-a) put A[i,:] to dense array aval */
    anz = ai[i+1] - ai[i];
    adj = aj + ai[i];
    ada = a_loc->a + ai[i];
    for (j=0; j<anz; j++) {
      aval[adj[j]] = ada[j];
    }

    /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
    /*--------------------------------------------------------------*/
    /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
    pnz = po->i[i+1] - po->i[i];
    poJ = po->j + po->i[i];
    pA  = po->a + po->i[i];
    for (j=0; j<pnz; j++) {
      row = poJ[j];
      cnz = coi[row+1] - coi[row];
      cj  = coj + coi[row];
      ca  = coa + coi[row];
      /* perform dense axpy */
      valtmp = pA[j];
      for (k=0; k<cnz; k++) {
        ca[k] += valtmp*aval[cj[k]];
      }
      ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
    }

    /* put the value into Cd (diagonal part) */
    pnz = pd->i[i+1] - pd->i[i];
    pdJ = pd->j + pd->i[i];
    pA  = pd->a + pd->i[i];
    for (j=0; j<pnz; j++) {
      row = pdJ[j];
      cnz = bi[row+1] - bi[row];
      cj  = bj + bi[row];
      ca  = ba + bi[row];
      /* perform dense axpy */
      valtmp = pA[j];
      for (k=0; k<cnz; k++) {
        ca[k] += valtmp*aval[cj[k]];
      }
      ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
    }

    /* zero out the dense row aval of A_loc */
    aJ = aj + ai[i];
    for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
  }

  /* 3) send and recv matrix values coa */
  /*------------------------------------*/
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;
  len_s  = merge->len_s;
  ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
  ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = merge->owners_co[proc];
    ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }
  if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}

  ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);
  ierr = PetscFree(coa);CHKERRQ(ierr);

  /* 4) insert local Cseq and received values into Cmpi */
  /*----------------------------------------------------*/
  ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of the k-th received i-structure */
    nrows       = *(buf_ri_k[k]);
    nextrow[k]  = buf_ri_k[k]+1;  /* next row number of the k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of the k-th received i-structure */
  }

  for (i=0; i<cm; i++) {
    row  = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj + bi[i];  /* col indices of the i-th row of C */
    ba_i = ba + bi[i];
    bnz  = bi[i+1] - bi[i];
    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++) { /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cnz    = *(nextci[k]+1) - *nextci[k];
        cj     = buf_rj[k] + *(nextci[k]);
        ca     = abuf_r[k] + *(nextci[k]);
        nextcj = 0;
        for (j=0; nextcj<cnz; j++) {
          if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
            ba_i[j] += ca[nextcj++];
          }
        }
        nextrow[k]++; nextci[k]++;
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }
    }
    ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ierr = PetscFree(ba);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscFree(aval);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
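
/*
   The numeric routine above forms C = P^T*A as a sum of row outer products,
   C = sum_i P[i,:]^T * A[i,:]: row i of A_loc is scattered into the dense
   work array aval of length A->cmap->N (the "non-scalable" part), each
   nonzero P[i,j] then adds P[i,j]*aval[.] into row j of the off-diagonal
   product Co (sent to the owning rank) or of the local part ba, and aval is
   re-zeroed using only the pattern of A[i,:].
*/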
1080 
1081 PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1082 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1083 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,PetscReal fill,Mat *C)
1084 {
1085   PetscErrorCode      ierr;
1086   Mat                 Cmpi,A_loc,POt,PDt;
1087   Mat_PtAPMPI         *ptap;
1088   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1089   Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
1090   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1091   PetscInt            nnz;
1092   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1093   PetscInt            am=A->rmap->n,pn=P->cmap->n;
1094   PetscBT             lnkbt;
1095   MPI_Comm            comm;
1096   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1097   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1098   PetscInt            len,proc,*dnz,*onz,*owners;
1099   PetscInt            nzi,*bi,*bj;
1100   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1101   MPI_Request         *swaits,*rwaits;
1102   MPI_Status          *sstatus,rstatus;
1103   Mat_Merge_SeqsToMPI *merge;
1104   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1105   PetscReal           afill  =1.0,afill_tmp;
1106   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N;
1107   PetscScalar         *vals;
1108   Mat_SeqAIJ          *a_loc, *pdt,*pot;
1109 
1110   PetscFunctionBegin;
1111   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1112   /* check if matrix local sizes are compatible */
1113   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1114 
1115   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1116   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1117 
1118   /* create struct Mat_PtAPMPI and attached it to C later */
1119   ierr = PetscNew(&ptap);CHKERRQ(ierr);
1120 
1121   /* get A_loc by taking all local rows of A */
1122   ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
1123 
1124   ptap->A_loc = A_loc;
1125 
1126   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1127   ai    = a_loc->i;
1128   aj    = a_loc->j;
1129 
1130   /* determine symbolic Co=(p->B)^T*A - send to others */
1131   /*----------------------------------------------------*/
1132   ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
1133   pdt  = (Mat_SeqAIJ*)PDt->data;
1134   pdti = pdt->i; pdtj = pdt->j;
1135 
1136   ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
1137   pot  = (Mat_SeqAIJ*)POt->data;
1138   poti = pot->i; potj = pot->j;
1139 
1140   /* then, compute symbolic Co = (p->B)^T*A */
1141   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */
1142   ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
1143   coi[0] = 0;
1144 
1145   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1146   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],ai[am]));
1147   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1148   current_space = free_space;
1149 
1150   /* create and initialize a linked list */
1151   ierr = PetscLLCondensedCreate(aN,aN,&lnk,&lnkbt);CHKERRQ(ierr);
1152 
1153   for (i=0; i<pon; i++) {
1154     pnz = poti[i+1] - poti[i];
1155     ptJ = potj + poti[i];
1156     for (j=0; j<pnz; j++) {
1157       row  = ptJ[j]; /* row of A_loc == col of Pot */
1158       anz  = ai[row+1] - ai[row];
1159       Jptr = aj + ai[row];
1160       /* add non-zero cols of AP into the sorted linked list lnk */
1161       ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1162     }
1163     nnz = lnk[0];
1164 
1165     /* If free space is not available, double the total space in the list */
1166     if (current_space->local_remaining<nnz) {
1167       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1168       nspacedouble++;
1169     }
1170 
1171     /* Copy data into free space, and zero out denserows */
1172     ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1173 
1174     current_space->array           += nnz;
1175     current_space->local_used      += nnz;
1176     current_space->local_remaining -= nnz;
1177 
1178     coi[i+1] = coi[i] + nnz;
1179   }
1180 
1181   ierr = PetscMalloc1(coi[pon]+1,&coj);CHKERRQ(ierr);
1182   ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
1183 
1184   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1185   if (afill_tmp > afill) afill = afill_tmp;
1186 
1187   /* send j-array (coj) of Co to other processors */
1188   /*----------------------------------------------*/
1189   /* determine row ownership */
1190   ierr = PetscNew(&merge);CHKERRQ(ierr);
1191   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1192 
1193   merge->rowmap->n  = pn;
1194   merge->rowmap->bs = 1;
1195 
1196   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1197   owners = merge->rowmap->range;
1198 
1199   /* determine the number of messages to send, their lengths */
1200   ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
1201   ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);
1202 
1203   len_s        = merge->len_s;
1204   merge->nsend = 0;
1205 
1206   ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
1207   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1208 
1209   proc = 0;
1210   for (i=0; i<pon; i++) {
1211     while (prmap[i] >= owners[proc+1]) proc++;
1212     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1213     len_s[proc] += coi[i+1] - coi[i];
1214   }
1215 
1216   len          = 0; /* max length of buf_si[] */
1217   owners_co[0] = 0;
1218   for (proc=0; proc<size; proc++) {
1219     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1220     if (len_si[proc]) {
1221       merge->nsend++;
1222       len_si[proc] = 2*(len_si[proc] + 1);
1223       len         += len_si[proc];
1224     }
1225   }
1226 
1227   /* determine the number and length of messages to receive for coi and coj  */
1228   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1229   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1230 
1231   /* post the Irecv and Isend of coj */
1232   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1233   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1234   ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
1235   for (proc=0, k=0; proc<size; proc++) {
1236     if (!len_s[proc]) continue;
1237     i    = owners_co[proc];
1238     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1239     k++;
1240   }
1241 
1242   /* receives and sends of coj are complete */
1243   ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
1244   for (i=0; i<merge->nrecv; i++) {
1245     PetscMPIInt icompleted;
1246     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1247   }
1248   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1249   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1250 
1251   /* send and recv coi */
1252   /*-------------------*/
1253   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1254   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1255   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
1256   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1257   for (proc=0,k=0; proc<size; proc++) {
1258     if (!len_s[proc]) continue;
1259     /* form outgoing message for i-structure:
1260          buf_si[0]:                 nrows to be sent
1261                [1:nrows]:           row index (global)
1262                [nrows+1:2*nrows+1]: i-structure index
1263     */
1264     /*-------------------------------------------*/
1265     nrows       = len_si[proc]/2 - 1;
1266     buf_si_i    = buf_si + nrows+1;
1267     buf_si[0]   = nrows;
1268     buf_si_i[0] = 0;
1269     nrows       = 0;
1270     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1271       nzi               = coi[i+1] - coi[i];
1272       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1273       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1274       nrows++;
1275     }
1276     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1277     k++;
1278     buf_si += len_si[proc];
1279   }
1280   i = merge->nrecv;
1281   while (i--) {
1282     PetscMPIInt icompleted;
1283     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1284   }
1285   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1286   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1287   ierr = PetscFree(len_si);CHKERRQ(ierr);
1288   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1289   ierr = PetscFree(swaits);CHKERRQ(ierr);
1290   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1291   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1292 
1293   /* compute the local portion of C (mpi mat) */
1294   /*------------------------------------------*/
1295   /* allocate bi array and free space for accumulating nonzero column info */
1296   ierr  = PetscMalloc1(pn+1,&bi);CHKERRQ(ierr);
1297   bi[0] = 0;
1298 
1299   /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1300   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pdti[pn],PetscIntSumTruncate(poti[pon],ai[am])));
1301   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1302   current_space = free_space;
1303 
1304   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1305   for (k=0; k<merge->nrecv; k++) {
1306     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1307     nrows       = *buf_ri_k[k];
1308     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
1309     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure  */
1310   }
1311 
1312   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1313   rmax = 0;
1314   for (i=0; i<pn; i++) {
1315     /* add pdt[i,:]*AP into lnk */
1316     pnz = pdti[i+1] - pdti[i];
1317     ptJ = pdtj + pdti[i];
1318     for (j=0; j<pnz; j++) {
1319       row  = ptJ[j];  /* row of AP == col of Pt */
1320       anz  = ai[row+1] - ai[row];
1321       Jptr = aj + ai[row];
1322       /* add non-zero cols of AP into the sorted linked list lnk */
1323       ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1324     }
1325 
1326     /* add received col data into lnk */
1327     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1328       if (i == *nextrow[k]) { /* i-th row */
1329         nzi  = *(nextci[k]+1) - *nextci[k];
1330         Jptr = buf_rj[k] + *nextci[k];
1331         ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1332         nextrow[k]++; nextci[k]++;
1333       }
1334     }
1335     nnz = lnk[0];
1336 
1337     /* if free space is not available, make more free space */
1338     if (current_space->local_remaining<nnz) {
1339       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1340       nspacedouble++;
1341     }
1342     /* copy data into free space, then initialize lnk */
1343     ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1344     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1345 
1346     current_space->array           += nnz;
1347     current_space->local_used      += nnz;
1348     current_space->local_remaining -= nnz;
1349 
1350     bi[i+1] = bi[i] + nnz;
1351     if (nnz > rmax) rmax = nnz;
1352   }
1353   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1354 
1355   ierr = PetscMalloc1(bi[pn]+1,&bj);CHKERRQ(ierr);
1356   ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1357 
1358   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1359   if (afill_tmp > afill) afill = afill_tmp;
1360   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
1361   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1362   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1363 
1364   /* create symbolic parallel matrix Cmpi - why can it not be assembled in the Numeric part? */
1365   /*----------------------------------------------------------------------------------*/
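       /* vals is a zero-initialized scratch row (rmax = longest row of C): only the
          nonzero pattern is inserted here; the numeric phase fills in the values */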
1366   ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);
1367 
1368   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1369   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1370   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
1371   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1372   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1373   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1374   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1375   for (i=0; i<pn; i++) {
1376     row  = i + rstart;
1377     nnz  = bi[i+1] - bi[i];
1378     Jptr = bj + bi[i];
1379     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1380   }
1381   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1382   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1383   ierr = PetscFree(vals);CHKERRQ(ierr);
1384 
1385   merge->bi        = bi;
1386   merge->bj        = bj;
1387   merge->coi       = coi;
1388   merge->coj       = coj;
1389   merge->buf_ri    = buf_ri;
1390   merge->buf_rj    = buf_rj;
1391   merge->owners_co = owners_co;
1392 
1393   /* attach the supporting struct to Cmpi for reuse */
1394   c           = (Mat_MPIAIJ*)Cmpi->data;
1395   c->ptap     = ptap;
1396   ptap->api   = NULL;
1397   ptap->apj   = NULL;
1398   ptap->merge = merge;
1399   ptap->destroy   = Cmpi->ops->destroy;
1400   ptap->duplicate = Cmpi->ops->duplicate;
1401 
1402   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
1403   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1404   Cmpi->ops->duplicate               = MatDuplicate_MPIAIJ_MatPtAP;
1405 
1406   *C = Cmpi;
1407 #if defined(PETSC_USE_INFO)
1408   if (bi[pn] != 0) {
1409     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
1410     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
1411   } else {
1412     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1413   }
1414 #endif
1415   PetscFunctionReturn(0);
1416 }
1417 
1418 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
1419 {
1420   PetscErrorCode      ierr;
1421   Mat_Merge_SeqsToMPI *merge;
1422   Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1423   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1424   Mat_PtAPMPI         *ptap;
1425   PetscInt            *adj;
1426   PetscInt            i,j,k,anz,pnz,row,*cj,nexta;
1427   MatScalar           *ada,*ca,valtmp;
1428   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1429   MPI_Comm            comm;
1430   PetscMPIInt         size,rank,taga,*len_s;
1431   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1432   PetscInt            **buf_ri,**buf_rj;
1433   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local arrays of C (mpi mat) */
1434   MPI_Request         *s_waits,*r_waits;
1435   MPI_Status          *status;
1436   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
1437   PetscInt            *ai,*aj,*coi,*coj;
1438   PetscInt            *poJ,*pdJ;
1439   Mat                 A_loc;
1440   Mat_SeqAIJ          *a_loc;
1441 
1442   PetscFunctionBegin;
1443   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1444   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1445   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1446 
1447   ptap  = c->ptap;
1448   merge = ptap->merge;
1449 
1450   /* 2) compute numeric C_seq = P_loc^T*A_loc */
1451   /*------------------------------------------*/
1452   /* get data from symbolic products */
1453   coi    = merge->coi; coj = merge->coj;
1454   ierr   = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);
1455   bi     = merge->bi; bj = merge->bj;
1456   owners = merge->rowmap->range;
1457   ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);
1458 
1459   /* get A_loc by taking all local rows of A */
1460   A_loc = ptap->A_loc;
1461   ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
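       /* MAT_REUSE_MATRIX refreshes, in place, the values of the A_loc built in the symbolic phase */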
1462   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1463   ai    = a_loc->i;
1464   aj    = a_loc->j;
1465 
1466   for (i=0; i<am; i++) {
1467     anz = ai[i+1] - ai[i];
1468     adj = aj + ai[i];
1469     ada = a_loc->a + ai[i];
1470 
1471     /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1472     /*-------------------------------------------------------------*/
1473     /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1474     pnz = po->i[i+1] - po->i[i];
1475     poJ = po->j + po->i[i];
1476     pA  = po->a + po->i[i];
1477     for (j=0; j<pnz; j++) {
1478       row = poJ[j];
1479       cj  = coj + coi[row];
1480       ca  = coa + coi[row];
1481       /* perform sparse axpy */
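           /* cj (row of Co) and adj (row of A_loc) are both sorted, and every column of
              A_loc[i,:] occurs in Co[row,:], so one two-pointer pass matches all entries */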
1482       nexta  = 0;
1483       valtmp = pA[j];
1484       for (k=0; nexta<anz; k++) {
1485         if (cj[k] == adj[nexta]) {
1486           ca[k] += valtmp*ada[nexta];
1487           nexta++;
1488         }
1489       }
1490       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1491     }
1492 
1493     /* put the value into Cd (diagonal part) */
1494     pnz = pd->i[i+1] - pd->i[i];
1495     pdJ = pd->j + pd->i[i];
1496     pA  = pd->a + pd->i[i];
1497     for (j=0; j<pnz; j++) {
1498       row = pdJ[j];
1499       cj  = bj + bi[row];
1500       ca  = ba + bi[row];
1501       /* perform sparse axpy */
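           /* same sorted two-pointer merge as for the off-diagonal block above */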
1502       nexta  = 0;
1503       valtmp = pA[j];
1504       for (k=0; nexta<anz; k++) {
1505         if (cj[k] == adj[nexta]) {
1506           ca[k] += valtmp*ada[nexta];
1507           nexta++;
1508         }
1509       }
1510       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1511     }
1512   }
1513 
1514   /* 3) send and recv matrix values coa */
1515   /*------------------------------------*/
1516   buf_ri = merge->buf_ri;
1517   buf_rj = merge->buf_rj;
1518   len_s  = merge->len_s;
1519   ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
1520   ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1521 
1522   ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
1523   for (proc=0,k=0; proc<size; proc++) {
1524     if (!len_s[proc]) continue;
1525     i    = merge->owners_co[proc];
1526     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1527     k++;
1528   }
1529   if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
1530   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
1531 
1532   ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
1533   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1534   ierr = PetscFree(coa);CHKERRQ(ierr);
1535 
1536   /* 4) insert local Cseq and received values into Cmpi */
1537   /*----------------------------------------------------*/
1538   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1539   for (k=0; k<merge->nrecv; k++) {
1540     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1541     nrows       = *(buf_ri_k[k]);
1542     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th recved i-structure */
1543     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1544   }
1545 
1546   for (i=0; i<cm; i++) {
1547     row  = owners[rank] + i; /* global row index of C_seq */
1548     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1549     ba_i = ba + bi[i];
1550     bnz  = bi[i+1] - bi[i];
1551     /* add received vals into ba */
1552     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1553       /* i-th row */
1554       if (i == *nextrow[k]) {
1555         cnz    = *(nextci[k]+1) - *nextci[k];
1556         cj     = buf_rj[k] + *(nextci[k]);
1557         ca     = abuf_r[k] + *(nextci[k]);
1558         nextcj = 0;
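             /* received cols cj are a sorted subset of bj_i (ensured by the symbolic phase),
                so one pass over bj_i merges all received values into ba_i */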
1559         for (j=0; nextcj<cnz; j++) {
1560           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1561             ba_i[j] += ca[nextcj++];
1562           }
1563         }
1564         nextrow[k]++; nextci[k]++;
1565         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1566       }
1567     }
1568     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
1569   }
1570   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1571   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1572 
1573   ierr = PetscFree(ba);CHKERRQ(ierr);
1574   ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
1575   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1576   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1577   PetscFunctionReturn(0);
1578 }
1579 
1580 PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat,MatDuplicateOption,Mat*);
1581 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ();
1582    it differs from MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable in using PetscLLCondensedCreate_Scalable() */
1583 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
1584 {
1585   PetscErrorCode      ierr;
1586   Mat                 Cmpi,A_loc,POt,PDt;
1587   Mat_PtAPMPI         *ptap;
1588   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1589   Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data,*a=(Mat_MPIAIJ*)A->data,*c;
1590   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1591   PetscInt            nnz;
1592   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1593   PetscInt            am  =A->rmap->n,pn=P->cmap->n;
1594   MPI_Comm            comm;
1595   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1596   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1597   PetscInt            len,proc,*dnz,*onz,*owners;
1598   PetscInt            nzi,*bi,*bj;
1599   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1600   MPI_Request         *swaits,*rwaits;
1601   MPI_Status          *sstatus,rstatus;
1602   Mat_Merge_SeqsToMPI *merge;
1603   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1604   PetscReal           afill  =1.0,afill_tmp;
1605   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Armax;
1606   PetscScalar         *vals;
1607   Mat_SeqAIJ          *a_loc,*pdt,*pot;
1608   PetscTable          ta;
1609 
1610   PetscFunctionBegin;
1611   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1612   /* check if matrix local sizes are compatible */
1613   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1614 
1615   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1616   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1617 
1618   /* create struct Mat_PtAPMPI and attach it to C later */
1619   ierr = PetscNew(&ptap);CHKERRQ(ierr);
1620 
1621   /* get A_loc by taking all local rows of A */
1622   ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
1623 
1624   ptap->A_loc = A_loc;
1625   a_loc       = (Mat_SeqAIJ*)(A_loc)->data;
1626   ai          = a_loc->i;
1627   aj          = a_loc->j;
1628 
1629   /* determine symbolic Co=(p->B)^T*A - send to others */
1630   /*----------------------------------------------------*/
1631   ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
1632   pdt  = (Mat_SeqAIJ*)PDt->data;
1633   pdti = pdt->i; pdtj = pdt->j;
1634 
1635   ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
1636   pot  = (Mat_SeqAIJ*)POt->data;
1637   poti = pot->i; potj = pot->j;
1638 
1639   /* then, compute symbolic Co = (p->B)^T*A */
1640   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors,
1641                                >= (num of nonzero rows of C_seq) - pn */
1642   ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
1643   coi[0] = 0;
1644 
1645   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1646   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],ai[am]));
1647   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1648   current_space = free_space;
1649 
1650   /* create and initialize a linked list */
1651   ierr = PetscTableCreate(A->cmap->n + a->B->cmap->N,aN,&ta);CHKERRQ(ierr);
1652   MatRowMergeMax_SeqAIJ(a_loc,am,ta);
1653   ierr = PetscTableGetCount(ta,&Armax);CHKERRQ(ierr);
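       /* Armax = number of distinct column indices of A_loc; it bounds the length of any
          merged row and sizes the scalable condensed linked list */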
1654 
1655   ierr = PetscLLCondensedCreate_Scalable(Armax,&lnk);CHKERRQ(ierr);
1656 
1657   for (i=0; i<pon; i++) {
1658     pnz = poti[i+1] - poti[i];
1659     ptJ = potj + poti[i];
1660     for (j=0; j<pnz; j++) {
1661       row  = ptJ[j]; /* row of A_loc == col of Pot */
1662       anz  = ai[row+1] - ai[row];
1663       Jptr = aj + ai[row];
1664       /* add non-zero cols of A_loc into the sorted linked list lnk */
1665       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1666     }
1667     nnz = lnk[0];
1668 
1669     /* If free space is not available, double the total space in the list */
1670     if (current_space->local_remaining<nnz) {
1671       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1672       nspacedouble++;
1673     }
1674 
1675     /* copy data into free space, then initialize lnk */
1676     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1677 
1678     current_space->array           += nnz;
1679     current_space->local_used      += nnz;
1680     current_space->local_remaining -= nnz;
1681 
1682     coi[i+1] = coi[i] + nnz;
1683   }
1684 
1685   ierr = PetscMalloc1(coi[pon]+1,&coj);CHKERRQ(ierr);
1686   ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
1687   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); /* must destroy to get a new one for C */
1688 
1689   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1690   if (afill_tmp > afill) afill = afill_tmp;
1691 
1692   /* send j-array (coj) of Co to other processors */
1693   /*----------------------------------------------*/
1694   /* determine row ownership */
1695   ierr = PetscNew(&merge);CHKERRQ(ierr);
1696   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1697 
1698   merge->rowmap->n  = pn;
1699   merge->rowmap->bs = 1;
1700 
1701   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1702   owners = merge->rowmap->range;
1703 
1704   /* determine the number of messages to send, their lengths */
1705   ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
1706   ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);
1707 
1708   len_s        = merge->len_s;
1709   merge->nsend = 0;
1710 
1711   ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
1712   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1713 
1714   proc = 0;
1715   for (i=0; i<pon; i++) {
1716     while (prmap[i] >= owners[proc+1]) proc++;
1717     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1718     len_s[proc] += coi[i+1] - coi[i];
1719   }
1720 
1721   len          = 0; /* max length of buf_si[] */
1722   owners_co[0] = 0;
1723   for (proc=0; proc<size; proc++) {
1724     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1725     if (len_si[proc]) {
1726       merge->nsend++;
1727       len_si[proc] = 2*(len_si[proc] + 1);
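           /* message length 2*(nrows+1) = 1 (nrows) + nrows (row indices) + (nrows+1) (i-structure offsets) */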
1728       len         += len_si[proc];
1729     }
1730   }
1731 
1732   /* determine the number and length of messages to receive for coi and coj  */
1733   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1734   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1735 
1736   /* post the Irecv and Isend of coj */
1737   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1738   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1739   ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
1740   for (proc=0, k=0; proc<size; proc++) {
1741     if (!len_s[proc]) continue;
1742     i    = owners_co[proc];
1743     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1744     k++;
1745   }
1746 
1747   /* receives and sends of coj are complete */
1748   ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
1749   for (i=0; i<merge->nrecv; i++) {
1750     PetscMPIInt icompleted;
1751     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1752   }
1753   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1754   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1755 
1756   /* add received column indices into table to update Armax */
1757   /* Armax can be as large as aN if a P[row,:] is dense, see src/ksp/ksp/examples/tutorials/ex56.c! */
1758   for (k=0; k<merge->nrecv; k++) {/* k-th received message */
1759     Jptr = buf_rj[k];
1760     for (j=0; j<merge->len_r[k]; j++) {
1761       ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
1762     }
1763   }
1764   ierr = PetscTableGetCount(ta,&Armax);CHKERRQ(ierr);
1766 
1767   /* send and recv coi */
1768   /*-------------------*/
1769   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1770   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1771   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
1772   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1773   for (proc=0,k=0; proc<size; proc++) {
1774     if (!len_s[proc]) continue;
1775     /* form outgoing message for i-structure:
1776          buf_si[0]:                 nrows to be sent
1777                [1:nrows]:           row index (global)
1778                [nrows+1:2*nrows+1]: i-structure index
1779     */
1780     /*-------------------------------------------*/
1781     nrows       = len_si[proc]/2 - 1;
1782     buf_si_i    = buf_si + nrows+1;
1783     buf_si[0]   = nrows;
1784     buf_si_i[0] = 0;
1785     nrows       = 0;
1786     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1787       nzi               = coi[i+1] - coi[i];
1788       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
1789       buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
1790       nrows++;
1791     }
1792     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1793     k++;
1794     buf_si += len_si[proc];
1795   }
1796   i = merge->nrecv;
1797   while (i--) {
1798     PetscMPIInt icompleted;
1799     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1800   }
1801   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1802   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1803   ierr = PetscFree(len_si);CHKERRQ(ierr);
1804   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1805   ierr = PetscFree(swaits);CHKERRQ(ierr);
1806   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1807   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1808 
1809   /* compute the local portion of C (mpi mat) */
1810   /*------------------------------------------*/
1811   /* allocate bi array and free space for accumulating nonzero column info */
1812   ierr  = PetscMalloc1(pn+1,&bi);CHKERRQ(ierr);
1813   bi[0] = 0;
1814 
1815   /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1816   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pdti[pn],PetscIntSumTruncate(poti[pon],ai[am])));
1817   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1818   current_space = free_space;
1819 
1820   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1821   for (k=0; k<merge->nrecv; k++) {
1822     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1823     nrows       = *buf_ri_k[k];
1824     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
1825     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1826   }
1827 
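       /* recreate lnk sized by the updated Armax, which now accounts for received column indices */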
1828   ierr = PetscLLCondensedCreate_Scalable(Armax,&lnk);CHKERRQ(ierr);
1829   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1830   rmax = 0;
1831   for (i=0; i<pn; i++) {
1832     /* add pdt[i,:]*A into lnk */
1833     pnz = pdti[i+1] - pdti[i];
1834     ptJ = pdtj + pdti[i];
1835     for (j=0; j<pnz; j++) {
1836       row  = ptJ[j];  /* row of A == col of Pt */
1837       anz  = ai[row+1] - ai[row];
1838       Jptr = aj + ai[row];
1839       /* add non-zero cols of A into the sorted linked list lnk */
1840       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1841     }
1842 
1843     /* add received col data into lnk */
1844     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1845       if (i == *nextrow[k]) { /* i-th row */
1846         nzi  = *(nextci[k]+1) - *nextci[k];
1847         Jptr = buf_rj[k] + *nextci[k];
1848         ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
1849         nextrow[k]++; nextci[k]++;
1850       }
1851     }
1852     nnz = lnk[0];
1853 
1854     /* if free space is not available, make more free space */
1855     if (current_space->local_remaining<nnz) {
1856       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1857       nspacedouble++;
1858     }
1859     /* copy data into free space, then initialize lnk */
1860     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1861     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1862 
1863     current_space->array           += nnz;
1864     current_space->local_used      += nnz;
1865     current_space->local_remaining -= nnz;
1866 
1867     bi[i+1] = bi[i] + nnz;
1868     if (nnz > rmax) rmax = nnz;
1869   }
1870   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1871 
1872   ierr      = PetscMalloc1(bi[pn]+1,&bj);CHKERRQ(ierr);
1873   ierr      = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1874   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1875   if (afill_tmp > afill) afill = afill_tmp;
1876   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
1877   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
1878 
1879   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1880   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1881 
1882   /* create symbolic parallel matrix Cmpi - why can it not be assembled in the Numeric part? */
1883   /*----------------------------------------------------------------------------------*/
1884   ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);
1885 
1886   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1887   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1888   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
1889   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1890   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1891   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1892   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1893   for (i=0; i<pn; i++) {
1894     row  = i + rstart;
1895     nnz  = bi[i+1] - bi[i];
1896     Jptr = bj + bi[i];
1897     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1898   }
1899   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1900   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1901   ierr = PetscFree(vals);CHKERRQ(ierr);
1902 
1903   merge->bi        = bi;
1904   merge->bj        = bj;
1905   merge->coi       = coi;
1906   merge->coj       = coj;
1907   merge->buf_ri    = buf_ri;
1908   merge->buf_rj    = buf_rj;
1909   merge->owners_co = owners_co;
1910 
1911   /* attach the supporting struct to Cmpi for reuse */
1912   c = (Mat_MPIAIJ*)Cmpi->data;
1913 
1914   c->ptap     = ptap;
1915   ptap->api   = NULL;
1916   ptap->apj   = NULL;
1917   ptap->merge = merge;
1918   ptap->apa   = NULL;
1919   ptap->destroy   = Cmpi->ops->destroy;
1920   ptap->duplicate = Cmpi->ops->duplicate;
1921 
1922   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
1923   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1924   Cmpi->ops->duplicate               = MatDuplicate_MPIAIJ_MatPtAP;
1925 
1926   *C = Cmpi;
1927 #if defined(PETSC_USE_INFO)
1928   if (bi[pn] != 0) {
1929     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
1930     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
1931   } else {
1932     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1933   }
1934 #endif
1935   PetscFunctionReturn(0);
1936 }
1937