xref: /petsc/src/mat/impls/aij/mpi/mpimatmatmult.c (revision 4c1069a6f7e44f78d90b41175756006e76a04441)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of MPIAIJ matrices
4           C = A * B
5 */
6 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7 #include <../src/mat/utils/freespace.h>
8 #include <../src/mat/impls/aij/mpi/mpiaij.h>
9 #include <petscbt.h>
10 #include <../src/mat/impls/dense/mpi/mpidense.h>
11 #include <petsc/private/vecimpl.h>
12 
13 #undef __FUNCT__
14 #define __FUNCT__ "MatMatMult_MPIAIJ_MPIAIJ"
15 PETSC_INTERN PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C)
16 {
17   PetscErrorCode ierr;
18   const char     *algTypes[2] = {"scalable","nonscalable"};
19   PetscInt       alg=1; /* set default algorithm */
20   MPI_Comm       comm;
21 
22   PetscFunctionBegin;
23   if (scall == MAT_INITIAL_MATRIX) {
24     ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
25     if (A->cmap->rstart != B->rmap->rstart || A->cmap->rend != B->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,B->rmap->rstart,B->rmap->rend);
26 
27     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
28     ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,2,algTypes[1],&alg,NULL);CHKERRQ(ierr);
29     ierr = PetscOptionsEnd();CHKERRQ(ierr);
30 
31     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
32     switch (alg) {
33     case 1:
34       ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A,B,fill,C);CHKERRQ(ierr);
35       break;
36     default:
37       ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);CHKERRQ(ierr);
38       break;
39     }
40     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
41   }
42   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
43   ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
44   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
45   PetscFunctionReturn(0);
46 }
47 
48 #undef __FUNCT__
49 #define __FUNCT__ "MatDestroy_MPIAIJ_MatMatMult"
50 PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
51 {
52   PetscErrorCode ierr;
53   Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
54   Mat_PtAPMPI    *ptap = a->ptap;
55 
56   PetscFunctionBegin;
57   ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
58   ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
59   ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
60   ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
61   ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);
62   ierr = PetscFree(ptap->api);CHKERRQ(ierr);
63   ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
64   ierr = PetscFree(ptap->apa);CHKERRQ(ierr);
65   ierr = ptap->destroy(A);CHKERRQ(ierr);
66   ierr = PetscFree(ptap);CHKERRQ(ierr);
67   PetscFunctionReturn(0);
68 }
69 
70 #undef __FUNCT__
71 #define __FUNCT__ "MatDuplicate_MPIAIJ_MatMatMult"
72 PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
73 {
74   PetscErrorCode ierr;
75   Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
76   Mat_PtAPMPI    *ptap = a->ptap;
77 
78   PetscFunctionBegin;
79   ierr = (*ptap->duplicate)(A,op,M);CHKERRQ(ierr);
80 
81   (*M)->ops->destroy   = ptap->destroy;   /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
82   (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
83   PetscFunctionReturn(0);
84 }
85 
86 #undef __FUNCT__
87 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable"
88 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,Mat C)
89 {
90   PetscErrorCode ierr;
91   Mat_MPIAIJ     *a  =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
92   Mat_SeqAIJ     *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
93   Mat_SeqAIJ     *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
94   PetscScalar    *cda=cd->a,*coa=co->a;
95   Mat_SeqAIJ     *p_loc,*p_oth;
96   PetscScalar    *apa,*ca;
97   PetscInt       cm   =C->rmap->n;
98   Mat_PtAPMPI    *ptap=c->ptap;
99   PetscInt       *api,*apj,*apJ,i,k;
100   PetscInt       cstart=C->cmap->rstart;
101   PetscInt       cdnz,conz,k0,k1;
102   MPI_Comm       comm;
103   PetscMPIInt    size;
104 
105   PetscFunctionBegin;
106   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
107   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
108 
109   /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
110   /*-----------------------------------------------------*/
111   /* update numerical values of P_oth and P_loc */
112   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
113   ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
114 
115   /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
116   /*----------------------------------------------------------*/
117   /* get data from symbolic products */
118   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
119   p_oth = NULL;
120   if (size >1) {
121     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
122   }
123 
124   /* get apa for storing dense row A[i,:]*P */
125   apa = ptap->apa;
126 
127   api = ptap->api;
128   apj = ptap->apj;
129   for (i=0; i<cm; i++) {
130     /* compute apa = A[i,:]*P */
131     AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
132 
133     /* set values in C */
134     apJ  = apj + api[i];
135     cdnz = cd->i[i+1] - cd->i[i];
136     conz = co->i[i+1] - co->i[i];
137 
138     /* 1st off-diagoanl part of C */
139     ca = coa + co->i[i];
140     k  = 0;
141     for (k0=0; k0<conz; k0++) {
142       if (apJ[k] >= cstart) break;
143       ca[k0]      = apa[apJ[k]];
144       apa[apJ[k]] = 0.0;
145       k++;
146     }
147 
148     /* diagonal part of C */
149     ca = cda + cd->i[i];
150     for (k1=0; k1<cdnz; k1++) {
151       ca[k1]      = apa[apJ[k]];
152       apa[apJ[k]] = 0.0;
153       k++;
154     }
155 
156     /* 2nd off-diagoanl part of C */
157     ca = coa + co->i[i];
158     for (; k0<conz; k0++) {
159       ca[k0]      = apa[apJ[k]];
160       apa[apJ[k]] = 0.0;
161       k++;
162     }
163   }
164   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
165   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
166   PetscFunctionReturn(0);
167 }
168 
169 #undef __FUNCT__
170 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable"
171 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,PetscReal fill,Mat *C)
172 {
173   PetscErrorCode     ierr;
174   MPI_Comm           comm;
175   PetscMPIInt        size;
176   Mat                Cmpi;
177   Mat_PtAPMPI        *ptap;
178   PetscFreeSpaceList free_space=NULL,current_space=NULL;
179   Mat_MPIAIJ         *a        =(Mat_MPIAIJ*)A->data,*c;
180   Mat_SeqAIJ         *ad       =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
181   PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
182   PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
183   PetscInt           *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
184   PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n,Crmax;
185   PetscBT            lnkbt;
186   PetscScalar        *apa;
187   PetscReal          afill;
188   PetscTable         ta;
189 
190   PetscFunctionBegin;
191   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
192   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
193 
194   /* create struct Mat_PtAPMPI and attached it to C later */
195   ierr = PetscNew(&ptap);CHKERRQ(ierr);
196 
197   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
198   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
199 
200   /* get P_loc by taking all local rows of P */
201   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
202 
203   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
204   pi_loc = p_loc->i; pj_loc = p_loc->j;
205   if (size > 1) {
206     p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
207     pi_oth = p_oth->i; pj_oth = p_oth->j;
208   } else {
209     p_oth = NULL;
210     pi_oth = NULL; pj_oth = NULL;
211   }
212 
213   /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
214   /*-------------------------------------------------------------------*/
215   ierr      = PetscMalloc1(am+2,&api);CHKERRQ(ierr);
216   ptap->api = api;
217   api[0]    = 0;
218 
219   /* create and initialize a linked list -- TODO: replace it with PetscBTCreate()! */
220   ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr);
221   MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
222   MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
223   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
224   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
225 
226   ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);
227 
228   /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
229   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);CHKERRQ(ierr);
230   current_space = free_space;
231 
232   ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
233   for (i=0; i<am; i++) {
234     /* diagonal portion of A */
235     nzi = adi[i+1] - adi[i];
236     for (j=0; j<nzi; j++) {
237       row  = *adj++;
238       pnz  = pi_loc[row+1] - pi_loc[row];
239       Jptr = pj_loc + pi_loc[row];
240       /* add non-zero cols of P into the sorted linked list lnk */
241       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
242     }
243     /* off-diagonal portion of A */
244     nzi = aoi[i+1] - aoi[i];
245     for (j=0; j<nzi; j++) {
246       row  = *aoj++;
247       pnz  = pi_oth[row+1] - pi_oth[row];
248       Jptr = pj_oth + pi_oth[row];
249       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
250     }
251 
252     apnz     = lnk[0];
253     api[i+1] = api[i] + apnz;
254 
255     /* if free space is not available, double the total space in the list */
256     if (current_space->local_remaining<apnz) {
257       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
258       nspacedouble++;
259     }
260 
261     /* Copy data into free space, then initialize lnk */
262     ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
263     ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);
264 
265     current_space->array           += apnz;
266     current_space->local_used      += apnz;
267     current_space->local_remaining -= apnz;
268   }
269 
270   /* Allocate space for apj, initialize apj, and */
271   /* destroy list of free space and other temporary array(s) */
272   ierr = PetscMalloc1(api[am]+1,&ptap->apj);CHKERRQ(ierr);
273   apj  = ptap->apj;
274   ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
275   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
276 
277   /* malloc apa to store dense row A[i,:]*P */
278   ierr = PetscCalloc1(pN,&apa);CHKERRQ(ierr);
279 
280   ptap->apa = apa;
281 
282   /* create and assemble symbolic parallel matrix Cmpi */
283   /*----------------------------------------------------*/
284   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
285   ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
286   ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);
287 
288   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
289   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
290   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
291   for (i=0; i<am; i++) {
292     row  = i + rstart;
293     apnz = api[i+1] - api[i];
294     ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
295     apj += apnz;
296   }
297   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
298   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
299 
300   ptap->destroy        = Cmpi->ops->destroy;
301   ptap->duplicate      = Cmpi->ops->duplicate;
302   Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
303   Cmpi->ops->destroy   = MatDestroy_MPIAIJ_MatMatMult;
304   Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
305 
306   /* attach the supporting struct to Cmpi for reuse */
307   c       = (Mat_MPIAIJ*)Cmpi->data;
308   c->ptap = ptap;
309 
310   *C = Cmpi;
311 
312   /* set MatInfo */
313   afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
314   if (afill < 1.0) afill = 1.0;
315   Cmpi->info.mallocs           = nspacedouble;
316   Cmpi->info.fill_ratio_given  = fill;
317   Cmpi->info.fill_ratio_needed = afill;
318 
319 #if defined(PETSC_USE_INFO)
320   if (api[am]) {
321     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
322     ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
323   } else {
324     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
325   }
326 #endif
327   PetscFunctionReturn(0);
328 }
329 
330 #undef __FUNCT__
331 #define __FUNCT__ "MatMatMult_MPIAIJ_MPIDense"
332 PETSC_INTERN PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
333 {
334   PetscErrorCode ierr;
335 
336   PetscFunctionBegin;
337   if (scall == MAT_INITIAL_MATRIX) {
338     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
339     ierr = MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);CHKERRQ(ierr);
340     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
341   }
342   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
343   ierr = MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);CHKERRQ(ierr);
344   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
345   PetscFunctionReturn(0);
346 }
347 
348 typedef struct {
349   Mat         workB;
350   PetscScalar *rvalues,*svalues;
351   MPI_Request *rwaits,*swaits;
352 } MPIAIJ_MPIDense;
353 
354 #undef __FUNCT__
355 #define __FUNCT__ "MatMPIAIJ_MPIDenseDestroy"
356 PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
357 {
358   MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
359   PetscErrorCode  ierr;
360 
361   PetscFunctionBegin;
362   ierr = MatDestroy(&contents->workB);CHKERRQ(ierr);
363   ierr = PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);CHKERRQ(ierr);
364   ierr = PetscFree(contents);CHKERRQ(ierr);
365   PetscFunctionReturn(0);
366 }
367 
368 #undef __FUNCT__
369 #define __FUNCT__ "MatMatMultNumeric_MPIDense"
370 /*
371     This is a "dummy function" that handles the case where matrix C was created as a dense matrix
372   directly by the user and passed to MatMatMult() with the MAT_REUSE_MATRIX option
373 
374   It is the same as MatMatMultSymbolic_MPIAIJ_MPIDense() except does not create C
375 */
376 PetscErrorCode MatMatMultNumeric_MPIDense(Mat A,Mat B,Mat C)
377 {
378   PetscErrorCode         ierr;
379   PetscBool              flg;
380   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
381   PetscInt               nz   = aij->B->cmap->n;
382   PetscContainer         container;
383   MPIAIJ_MPIDense        *contents;
384   VecScatter             ctx   = aij->Mvctx;
385   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
386   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
387 
388   PetscFunctionBegin;
389   ierr = PetscObjectTypeCompare((PetscObject)B,MATMPIDENSE,&flg);CHKERRQ(ierr);
390   if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Second matrix must be mpidense");
391 
392   /* Handle case where where user provided the final C matrix rather than calling MatMatMult() with MAT_INITIAL_MATRIX*/
393   ierr = PetscObjectTypeCompare((PetscObject)A,MATMPIAIJ,&flg);CHKERRQ(ierr);
394   if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"First matrix must be MPIAIJ");
395 
396   C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
397 
398   ierr = PetscNew(&contents);CHKERRQ(ierr);
399   /* Create work matrix used to store off processor rows of B needed for local product */
400   ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
401   /* Create work arrays needed */
402   ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
403                       B->cmap->N*to->starts[to->n],&contents->svalues,
404                       from->n,&contents->rwaits,
405                       to->n,&contents->swaits);CHKERRQ(ierr);
406 
407   ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr);
408   ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
409   ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
410   ierr = PetscObjectCompose((PetscObject)C,"workB",(PetscObject)container);CHKERRQ(ierr);
411   ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
412 
413   ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr);
414   PetscFunctionReturn(0);
415 }
416 
417 #undef __FUNCT__
418 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIDense"
419 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
420 {
421   PetscErrorCode         ierr;
422   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
423   PetscInt               nz   = aij->B->cmap->n;
424   PetscContainer         container;
425   MPIAIJ_MPIDense        *contents;
426   VecScatter             ctx   = aij->Mvctx;
427   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
428   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
429   PetscInt               m     = A->rmap->n,n=B->cmap->n;
430 
431   PetscFunctionBegin;
432   ierr = MatCreate(PetscObjectComm((PetscObject)B),C);CHKERRQ(ierr);
433   ierr = MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);CHKERRQ(ierr);
434   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
435   ierr = MatSetType(*C,MATMPIDENSE);CHKERRQ(ierr);
436   ierr = MatMPIDenseSetPreallocation(*C,NULL);CHKERRQ(ierr);
437   ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
438   ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
439 
440   (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
441 
442   ierr = PetscNew(&contents);CHKERRQ(ierr);
443   /* Create work matrix used to store off processor rows of B needed for local product */
444   ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
445   /* Create work arrays needed */
446   ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
447                       B->cmap->N*to->starts[to->n],&contents->svalues,
448                       from->n,&contents->rwaits,
449                       to->n,&contents->swaits);CHKERRQ(ierr);
450 
451   ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr);
452   ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
453   ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
454   ierr = PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);CHKERRQ(ierr);
455   ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
456   PetscFunctionReturn(0);
457 }
458 
459 #undef __FUNCT__
460 #define __FUNCT__ "MatMPIDenseScatter"
461 /*
462     Performs an efficient scatter on the rows of B needed by this process; this is
463     a modification of the VecScatterBegin_() routines.
464 */
465 PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
466 {
467   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*)A->data;
468   PetscErrorCode         ierr;
469   PetscScalar            *b,*w,*svalues,*rvalues;
470   VecScatter             ctx   = aij->Mvctx;
471   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
472   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
473   PetscInt               i,j,k;
474   PetscInt               *sindices,*sstarts,*rindices,*rstarts;
475   PetscMPIInt            *sprocs,*rprocs,nrecvs;
476   MPI_Request            *swaits,*rwaits;
477   MPI_Comm               comm;
478   PetscMPIInt            tag  = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
479   MPI_Status             status;
480   MPIAIJ_MPIDense        *contents;
481   PetscContainer         container;
482   Mat                    workB;
483 
484   PetscFunctionBegin;
485   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
486   ierr = PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);CHKERRQ(ierr);
487   if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
488   ierr = PetscContainerGetPointer(container,(void**)&contents);CHKERRQ(ierr);
489 
490   workB = *outworkB = contents->workB;
491   if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",nrows,workB->cmap->n);
492   sindices = to->indices;
493   sstarts  = to->starts;
494   sprocs   = to->procs;
495   swaits   = contents->swaits;
496   svalues  = contents->svalues;
497 
498   rindices = from->indices;
499   rstarts  = from->starts;
500   rprocs   = from->procs;
501   rwaits   = contents->rwaits;
502   rvalues  = contents->rvalues;
503 
504   ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr);
505   ierr = MatDenseGetArray(workB,&w);CHKERRQ(ierr);
506 
507   for (i=0; i<from->n; i++) {
508     ierr = MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);CHKERRQ(ierr);
509   }
510 
511   for (i=0; i<to->n; i++) {
512     /* pack a message at a time */
513     for (j=0; j<sstarts[i+1]-sstarts[i]; j++) {
514       for (k=0; k<ncols; k++) {
515         svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
516       }
517     }
518     ierr = MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);CHKERRQ(ierr);
519   }
520 
521   nrecvs = from->n;
522   while (nrecvs) {
523     ierr = MPI_Waitany(from->n,rwaits,&imdex,&status);CHKERRQ(ierr);
524     nrecvs--;
525     /* unpack a message at a time */
526     for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++) {
527       for (k=0; k<ncols; k++) {
528         w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
529       }
530     }
531   }
532   if (to->n) {ierr = MPI_Waitall(to->n,swaits,to->sstatus);CHKERRQ(ierr);}
533 
534   ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr);
535   ierr = MatDenseRestoreArray(workB,&w);CHKERRQ(ierr);
536   ierr = MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
537   ierr = MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
538   PetscFunctionReturn(0);
539 }
540 extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);
541 
542 #undef __FUNCT__
543 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIDense"
544 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
545 {
546   PetscErrorCode ierr;
547   Mat_MPIAIJ     *aij    = (Mat_MPIAIJ*)A->data;
548   Mat_MPIDense   *bdense = (Mat_MPIDense*)B->data;
549   Mat_MPIDense   *cdense = (Mat_MPIDense*)C->data;
550   Mat            workB;
551 
552   PetscFunctionBegin;
553   /* diagonal block of A times all local rows of B*/
554   ierr = MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);CHKERRQ(ierr);
555 
556   /* get off processor parts of B needed to complete the product */
557   ierr = MatMPIDenseScatter(A,B,C,&workB);CHKERRQ(ierr);
558 
559   /* off-diagonal block of A times nonlocal rows of B */
560   ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);CHKERRQ(ierr);
561   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
562   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
563   PetscFunctionReturn(0);
564 }
565 
566 #undef __FUNCT__
567 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ"
568 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
569 {
570   PetscErrorCode ierr;
571   Mat_MPIAIJ     *a   = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
572   Mat_SeqAIJ     *ad  = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
573   Mat_SeqAIJ     *cd  = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
574   PetscInt       *adi = ad->i,*adj,*aoi=ao->i,*aoj;
575   PetscScalar    *ada,*aoa,*cda=cd->a,*coa=co->a;
576   Mat_SeqAIJ     *p_loc,*p_oth;
577   PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
578   PetscScalar    *pa_loc,*pa_oth,*pa,valtmp,*ca;
579   PetscInt       cm          = C->rmap->n,anz,pnz;
580   Mat_PtAPMPI    *ptap       = c->ptap;
581   PetscScalar    *apa_sparse = ptap->apa;
582   PetscInt       *api,*apj,*apJ,i,j,k,row;
583   PetscInt       cstart = C->cmap->rstart;
584   PetscInt       cdnz,conz,k0,k1,nextp;
585   MPI_Comm       comm;
586   PetscMPIInt    size;
587 
588   PetscFunctionBegin;
589   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
590   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
591 
592   /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
593   /*-----------------------------------------------------*/
594   /* update numerical values of P_oth and P_loc */
595   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
596   ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
597 
598   /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
599   /*----------------------------------------------------------*/
600   /* get data from symbolic products */
601   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
602   pi_loc = p_loc->i; pj_loc = p_loc->j; pa_loc = p_loc->a;
603   if (size >1) {
604     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
605     pi_oth = p_oth->i; pj_oth = p_oth->j; pa_oth = p_oth->a;
606   } else {
607     p_oth = NULL; pi_oth = NULL; pj_oth = NULL; pa_oth = NULL;
608   }
609 
610   api = ptap->api;
611   apj = ptap->apj;
612   for (i=0; i<cm; i++) {
613     apJ = apj + api[i];
614 
615     /* diagonal portion of A */
616     anz = adi[i+1] - adi[i];
617     adj = ad->j + adi[i];
618     ada = ad->a + adi[i];
619     for (j=0; j<anz; j++) {
620       row = adj[j];
621       pnz = pi_loc[row+1] - pi_loc[row];
622       pj  = pj_loc + pi_loc[row];
623       pa  = pa_loc + pi_loc[row];
624       /* perform sparse axpy */
625       valtmp = ada[j];
626       nextp  = 0;
627       for (k=0; nextp<pnz; k++) {
628         if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
629           apa_sparse[k] += valtmp*pa[nextp++];
630         }
631       }
632       ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
633     }
634 
635     /* off-diagonal portion of A */
636     anz = aoi[i+1] - aoi[i];
637     aoj = ao->j + aoi[i];
638     aoa = ao->a + aoi[i];
639     for (j=0; j<anz; j++) {
640       row = aoj[j];
641       pnz = pi_oth[row+1] - pi_oth[row];
642       pj  = pj_oth + pi_oth[row];
643       pa  = pa_oth + pi_oth[row];
644       /* perform sparse axpy */
645       valtmp = aoa[j];
646       nextp  = 0;
647       for (k=0; nextp<pnz; k++) {
648         if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
649           apa_sparse[k] += valtmp*pa[nextp++];
650         }
651       }
652       ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
653     }
654 
655     /* set values in C */
656     cdnz = cd->i[i+1] - cd->i[i];
657     conz = co->i[i+1] - co->i[i];
658 
659     /* 1st off-diagoanl part of C */
660     ca = coa + co->i[i];
661     k  = 0;
662     for (k0=0; k0<conz; k0++) {
663       if (apJ[k] >= cstart) break;
664       ca[k0]        = apa_sparse[k];
665       apa_sparse[k] = 0.0;
666       k++;
667     }
668 
669     /* diagonal part of C */
670     ca = cda + cd->i[i];
671     for (k1=0; k1<cdnz; k1++) {
672       ca[k1]        = apa_sparse[k];
673       apa_sparse[k] = 0.0;
674       k++;
675     }
676 
677     /* 2nd off-diagoanl part of C */
678     ca = coa + co->i[i];
679     for (; k0<conz; k0++) {
680       ca[k0]        = apa_sparse[k];
681       apa_sparse[k] = 0.0;
682       k++;
683     }
684   }
685   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
686   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
687   PetscFunctionReturn(0);
688 }
689 
690 /* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(), except using LLCondensed to avoid O(BN) memory requirement */
691 #undef __FUNCT__
692 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ"
693 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
694 {
695   PetscErrorCode     ierr;
696   MPI_Comm           comm;
697   PetscMPIInt        size;
698   Mat                Cmpi;
699   Mat_PtAPMPI        *ptap;
700   PetscFreeSpaceList free_space = NULL,current_space=NULL;
701   Mat_MPIAIJ         *a         = (Mat_MPIAIJ*)A->data,*c;
702   Mat_SeqAIJ         *ad        = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
703   PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
704   PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
705   PetscInt           i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max;
706   PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
707   PetscReal          afill;
708   PetscScalar        *apa;
709   PetscTable         ta;
710 
711   PetscFunctionBegin;
712   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
713   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
714 
715   /* create struct Mat_PtAPMPI and attached it to C later */
716   ierr = PetscNew(&ptap);CHKERRQ(ierr);
717 
718   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
719   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
720 
721   /* get P_loc by taking all local rows of P */
722   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
723 
724   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
725   pi_loc = p_loc->i; pj_loc = p_loc->j;
726   if (size > 1) {
727     p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
728     pi_oth = p_oth->i; pj_oth = p_oth->j;
729   } else {
730     p_oth  = NULL;
731     pi_oth = NULL; pj_oth = NULL;
732   }
733 
734   /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
735   /*-------------------------------------------------------------------*/
736   ierr      = PetscMalloc1(am+2,&api);CHKERRQ(ierr);
737   ptap->api = api;
738   api[0]    = 0;
739 
740   /* create and initialize a linked list */
741   ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr);
742 
743   /* Calculate apnz_max */
744   apnz_max = 0;
745   for (i=0; i<am; i++) {
746     ierr = PetscTableRemoveAll(ta);CHKERRQ(ierr);
747     /* diagonal portion of A */
748     nzi  = adi[i+1] - adi[i];
749     Jptr = adj+adi[i];  /* cols of A_diag */
750     MatMergeRows_SeqAIJ(p_loc,nzi,Jptr,ta);
751     ierr = PetscTableGetCount(ta,&apnz);CHKERRQ(ierr);
752     if (apnz_max < apnz) apnz_max = apnz;
753 
754     /*  off-diagonal portion of A */
755     nzi = aoi[i+1] - aoi[i];
756     Jptr = aoj+aoi[i];  /* cols of A_off */
757     MatMergeRows_SeqAIJ(p_oth,nzi,Jptr,ta);
758     ierr = PetscTableGetCount(ta,&apnz);CHKERRQ(ierr);
759     if (apnz_max < apnz) apnz_max = apnz;
760   }
761   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
762 
763   ierr = PetscLLCondensedCreate_Scalable(apnz_max,&lnk);CHKERRQ(ierr);
764 
765   /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
766   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);CHKERRQ(ierr);
767   current_space = free_space;
768   ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
769   for (i=0; i<am; i++) {
770     /* diagonal portion of A */
771     nzi = adi[i+1] - adi[i];
772     for (j=0; j<nzi; j++) {
773       row  = *adj++;
774       pnz  = pi_loc[row+1] - pi_loc[row];
775       Jptr = pj_loc + pi_loc[row];
776       /* add non-zero cols of P into the sorted linked list lnk */
777       ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
778     }
779     /* off-diagonal portion of A */
780     nzi = aoi[i+1] - aoi[i];
781     for (j=0; j<nzi; j++) {
782       row  = *aoj++;
783       pnz  = pi_oth[row+1] - pi_oth[row];
784       Jptr = pj_oth + pi_oth[row];
785       ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
786     }
787 
788     apnz     = *lnk;
789     api[i+1] = api[i] + apnz;
790 
791     /* if free space is not available, double the total space in the list */
792     if (current_space->local_remaining<apnz) {
793       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
794       nspacedouble++;
795     }
796 
797     /* Copy data into free space, then initialize lnk */
798     ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);
799     ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);
800 
801     current_space->array           += apnz;
802     current_space->local_used      += apnz;
803     current_space->local_remaining -= apnz;
804   }
805 
806   /* Allocate space for apj, initialize apj, and */
807   /* destroy list of free space and other temporary array(s) */
808   ierr = PetscMalloc1(api[am]+1,&ptap->apj);CHKERRQ(ierr);
809   apj  = ptap->apj;
810   ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
811   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
812 
813   /* create and assemble symbolic parallel matrix Cmpi */
814   /*----------------------------------------------------*/
815   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
816   ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
817   ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);
818   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
819   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
820   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
821 
822   /* malloc apa for assembly Cmpi */
823   ierr = PetscCalloc1(apnz_max,&apa);CHKERRQ(ierr);
824 
825   ptap->apa = apa;
826   for (i=0; i<am; i++) {
827     row  = i + rstart;
828     apnz = api[i+1] - api[i];
829     ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
830     apj += apnz;
831   }
832   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
833   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
834 
835   ptap->destroy             = Cmpi->ops->destroy;
836   ptap->duplicate           = Cmpi->ops->duplicate;
837   Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ;
838   Cmpi->ops->destroy        = MatDestroy_MPIAIJ_MatMatMult;
839   Cmpi->ops->duplicate      = MatDuplicate_MPIAIJ_MatMatMult;
840 
841   /* attach the supporting struct to Cmpi for reuse */
842   c       = (Mat_MPIAIJ*)Cmpi->data;
843   c->ptap = ptap;
844 
845   *C = Cmpi;
846 
847   /* set MatInfo */
848   afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
849   if (afill < 1.0) afill = 1.0;
850   Cmpi->info.mallocs           = nspacedouble;
851   Cmpi->info.fill_ratio_given  = fill;
852   Cmpi->info.fill_ratio_needed = afill;
853 
854 #if defined(PETSC_USE_INFO)
855   if (api[am]) {
856     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
857     ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
858   } else {
859     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
860   }
861 #endif
862   PetscFunctionReturn(0);
863 }
864 
865 /*-------------------------------------------------------------------------*/
866 #undef __FUNCT__
867 #define __FUNCT__ "MatTransposeMatMult_MPIAIJ_MPIAIJ"
868 PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
869 {
870   PetscErrorCode ierr;
871   const char     *algTypes[3] = {"scalable","nonscalable","matmatmult"};
872   PetscInt       alg=0; /* set default algorithm */
873 
874   PetscFunctionBegin;
875   if (scall == MAT_INITIAL_MATRIX) {
876     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
877     ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,3,algTypes[0],&alg,NULL);CHKERRQ(ierr);
878     ierr = PetscOptionsEnd();CHKERRQ(ierr);
879 
880     ierr = PetscLogEventBegin(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
881     switch (alg) {
882     case 1:
883       ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(P,A,fill,C);CHKERRQ(ierr);
884       break;
885     case 2:
886     {
887       Mat         Pt;
888       Mat_PtAPMPI *ptap;
889       Mat_MPIAIJ  *c;
890       ierr = MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);CHKERRQ(ierr);
891       ierr = MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr);
892       c        = (Mat_MPIAIJ*)(*C)->data;
893       ptap     = c->ptap;
894       ptap->Pt = Pt;
895       (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult;
896       PetscFunctionReturn(0);
897     }
898       break;
899     default:
900       ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);CHKERRQ(ierr);
901       break;
902     }
903     ierr = PetscLogEventEnd(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
904   }
905   ierr = PetscLogEventBegin(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
906   ierr = (*(*C)->ops->mattransposemultnumeric)(P,A,*C);CHKERRQ(ierr);
907   ierr = PetscLogEventEnd(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
908   PetscFunctionReturn(0);
909 }
910 
911 /* This routine only works when scall=MAT_REUSE_MATRIX! */
912 #undef __FUNCT__
913 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult"
914 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult(Mat P,Mat A,Mat C)
915 {
916   PetscErrorCode ierr;
917   Mat_MPIAIJ     *c=(Mat_MPIAIJ*)C->data;
918   Mat_PtAPMPI    *ptap= c->ptap;
919   Mat            Pt=ptap->Pt;
920 
921   PetscFunctionBegin;
922   ierr = MatTranspose(P,MAT_REUSE_MATRIX,&Pt);CHKERRQ(ierr);
923   ierr = MatMatMultNumeric(Pt,A,C);CHKERRQ(ierr);
924   PetscFunctionReturn(0);
925 }
926 
927 /* Non-scalable version, use dense axpy */
928 #undef __FUNCT__
929 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable"
930 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,Mat C)
931 {
932   PetscErrorCode      ierr;
933   Mat_Merge_SeqsToMPI *merge;
934   Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
935   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
936   Mat_PtAPMPI         *ptap;
937   PetscInt            *adj,*aJ;
938   PetscInt            i,j,k,anz,pnz,row,*cj;
939   MatScalar           *ada,*aval,*ca,valtmp;
940   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
941   MPI_Comm            comm;
942   PetscMPIInt         size,rank,taga,*len_s;
943   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
944   PetscInt            **buf_ri,**buf_rj;
945   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
946   MPI_Request         *s_waits,*r_waits;
947   MPI_Status          *status;
948   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
949   PetscInt            *ai,*aj,*coi,*coj;
950   PetscInt            *poJ,*pdJ;
951   Mat                 A_loc;
952   Mat_SeqAIJ          *a_loc;
953 
954   PetscFunctionBegin;
955   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
956   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
957   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
958 
959   ptap  = c->ptap;
960   merge = ptap->merge;
961 
962   /* 2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */
963   /*--------------------------------------------------------------*/
964   /* get data from symbolic products */
965   coi  = merge->coi; coj = merge->coj;
966   ierr = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);
967 
968   bi     = merge->bi; bj = merge->bj;
969   owners = merge->rowmap->range;
970   ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);
971 
972   /* get A_loc by taking all local rows of A */
973   A_loc = ptap->A_loc;
974   ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
975   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
976   ai    = a_loc->i;
977   aj    = a_loc->j;
978 
979   ierr = PetscCalloc1(A->cmap->N,&aval);CHKERRQ(ierr); /* non-scalable!!! */
980 
981   for (i=0; i<am; i++) {
982     /* 2-a) put A[i,:] to dense array aval */
983     anz = ai[i+1] - ai[i];
984     adj = aj + ai[i];
985     ada = a_loc->a + ai[i];
986     for (j=0; j<anz; j++) {
987       aval[adj[j]] = ada[j];
988     }
989 
990     /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
991     /*--------------------------------------------------------------*/
992     /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
993     pnz = po->i[i+1] - po->i[i];
994     poJ = po->j + po->i[i];
995     pA  = po->a + po->i[i];
996     for (j=0; j<pnz; j++) {
997       row = poJ[j];
998       cnz = coi[row+1] - coi[row];
999       cj  = coj + coi[row];
1000       ca  = coa + coi[row];
1001       /* perform dense axpy */
1002       valtmp = pA[j];
1003       for (k=0; k<cnz; k++) {
1004         ca[k] += valtmp*aval[cj[k]];
1005       }
1006       ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1007     }
1008 
1009     /* put the value into Cd (diagonal part) */
1010     pnz = pd->i[i+1] - pd->i[i];
1011     pdJ = pd->j + pd->i[i];
1012     pA  = pd->a + pd->i[i];
1013     for (j=0; j<pnz; j++) {
1014       row = pdJ[j];
1015       cnz = bi[row+1] - bi[row];
1016       cj  = bj + bi[row];
1017       ca  = ba + bi[row];
1018       /* perform dense axpy */
1019       valtmp = pA[j];
1020       for (k=0; k<cnz; k++) {
1021         ca[k] += valtmp*aval[cj[k]];
1022       }
1023       ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1024     }
1025 
1026     /* zero the current row of Pt*A */
1027     aJ = aj + ai[i];
1028     for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
1029   }
1030 
1031   /* 3) send and recv matrix values coa */
1032   /*------------------------------------*/
1033   buf_ri = merge->buf_ri;
1034   buf_rj = merge->buf_rj;
1035   len_s  = merge->len_s;
1036   ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
1037   ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1038 
1039   ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
1040   for (proc=0,k=0; proc<size; proc++) {
1041     if (!len_s[proc]) continue;
1042     i    = merge->owners_co[proc];
1043     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1044     k++;
1045   }
1046   if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
1047   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
1048 
1049   ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
1050   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1051   ierr = PetscFree(coa);CHKERRQ(ierr);
1052 
1053   /* 4) insert local Cseq and received values into Cmpi */
1054   /*----------------------------------------------------*/
1055   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1056   for (k=0; k<merge->nrecv; k++) {
1057     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1058     nrows       = *(buf_ri_k[k]);
1059     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th recved i-structure */
1060     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure  */
1061   }
1062 
1063   for (i=0; i<cm; i++) {
1064     row  = owners[rank] + i; /* global row index of C_seq */
1065     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1066     ba_i = ba + bi[i];
1067     bnz  = bi[i+1] - bi[i];
1068     /* add received vals into ba */
1069     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1070       /* i-th row */
1071       if (i == *nextrow[k]) {
1072         cnz    = *(nextci[k]+1) - *nextci[k];
1073         cj     = buf_rj[k] + *(nextci[k]);
1074         ca     = abuf_r[k] + *(nextci[k]);
1075         nextcj = 0;
1076         for (j=0; nextcj<cnz; j++) {
1077           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1078             ba_i[j] += ca[nextcj++];
1079           }
1080         }
1081         nextrow[k]++; nextci[k]++;
1082         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1083       }
1084     }
1085     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
1086   }
1087   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1088   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1089 
1090   ierr = PetscFree(ba);CHKERRQ(ierr);
1091   ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
1092   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1093   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1094   ierr = PetscFree(aval);CHKERRQ(ierr);
1095   PetscFunctionReturn(0);
1096 }
1097 
1098 PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1099 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1100 #undef __FUNCT__
1101 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable"
1102 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,PetscReal fill,Mat *C)
1103 {
1104   PetscErrorCode      ierr;
1105   Mat                 Cmpi,A_loc,POt,PDt;
1106   Mat_PtAPMPI         *ptap;
1107   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1108   Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
1109   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1110   PetscInt            nnz;
1111   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1112   PetscInt            am=A->rmap->n,pn=P->cmap->n;
1113   PetscBT             lnkbt;
1114   MPI_Comm            comm;
1115   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1116   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1117   PetscInt            len,proc,*dnz,*onz,*owners;
1118   PetscInt            nzi,*bi,*bj;
1119   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1120   MPI_Request         *swaits,*rwaits;
1121   MPI_Status          *sstatus,rstatus;
1122   Mat_Merge_SeqsToMPI *merge;
1123   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1124   PetscReal           afill  =1.0,afill_tmp;
1125   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N;
1126   PetscScalar         *vals;
1127   Mat_SeqAIJ          *a_loc, *pdt,*pot;
1128 
1129   PetscFunctionBegin;
1130   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1131   /* check if matrix local sizes are compatible */
1132   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1133 
1134   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1135   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1136 
1137   /* create struct Mat_PtAPMPI and attached it to C later */
1138   ierr = PetscNew(&ptap);CHKERRQ(ierr);
1139 
1140   /* get A_loc by taking all local rows of A */
1141   ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
1142 
1143   ptap->A_loc = A_loc;
1144 
1145   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1146   ai    = a_loc->i;
1147   aj    = a_loc->j;
1148 
1149   /* determine symbolic Co=(p->B)^T*A - send to others */
1150   /*----------------------------------------------------*/
1151   ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
1152   pdt  = (Mat_SeqAIJ*)PDt->data;
1153   pdti = pdt->i; pdtj = pdt->j;
1154 
1155   ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
1156   pot  = (Mat_SeqAIJ*)POt->data;
1157   poti = pot->i; potj = pot->j;
1158 
1159   /* then, compute symbolic Co = (p->B)^T*A */
1160   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */
1161   ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
1162   coi[0] = 0;
1163 
1164   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1165   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],ai[am]));
1166   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1167   current_space = free_space;
1168 
1169   /* create and initialize a linked list */
1170   ierr = PetscLLCondensedCreate(aN,aN,&lnk,&lnkbt);CHKERRQ(ierr);
1171 
1172   for (i=0; i<pon; i++) {
1173     pnz = poti[i+1] - poti[i];
1174     ptJ = potj + poti[i];
1175     for (j=0; j<pnz; j++) {
1176       row  = ptJ[j]; /* row of A_loc == col of Pot */
1177       anz  = ai[row+1] - ai[row];
1178       Jptr = aj + ai[row];
1179       /* add non-zero cols of AP into the sorted linked list lnk */
1180       ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1181     }
1182     nnz = lnk[0];
1183 
1184     /* If free space is not available, double the total space in the list */
1185     if (current_space->local_remaining<nnz) {
1186       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1187       nspacedouble++;
1188     }
1189 
1190     /* Copy data into free space, and zero out denserows */
1191     ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1192 
1193     current_space->array           += nnz;
1194     current_space->local_used      += nnz;
1195     current_space->local_remaining -= nnz;
1196 
1197     coi[i+1] = coi[i] + nnz;
1198   }
1199 
1200   ierr = PetscMalloc1(coi[pon]+1,&coj);CHKERRQ(ierr);
1201   ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
1202 
1203   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1204   if (afill_tmp > afill) afill = afill_tmp;
1205 
1206   /* send j-array (coj) of Co to other processors */
1207   /*----------------------------------------------*/
1208   /* determine row ownership */
1209   ierr = PetscNew(&merge);CHKERRQ(ierr);
1210   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1211 
1212   merge->rowmap->n  = pn;
1213   merge->rowmap->bs = 1;
1214 
1215   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1216   owners = merge->rowmap->range;
1217 
1218   /* determine the number of messages to send, their lengths */
1219   ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
1220   ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);
1221 
1222   len_s        = merge->len_s;
1223   merge->nsend = 0;
1224 
1225   ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
1226   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1227 
1228   proc = 0;
1229   for (i=0; i<pon; i++) {
1230     while (prmap[i] >= owners[proc+1]) proc++;
1231     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1232     len_s[proc] += coi[i+1] - coi[i];
1233   }
1234 
1235   len          = 0; /* max length of buf_si[] */
1236   owners_co[0] = 0;
1237   for (proc=0; proc<size; proc++) {
1238     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1239     if (len_si[proc]) {
1240       merge->nsend++;
1241       len_si[proc] = 2*(len_si[proc] + 1);
1242       len         += len_si[proc];
1243     }
1244   }
1245 
1246   /* determine the number and length of messages to receive for coi and coj  */
1247   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1248   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1249 
1250   /* post the Irecv and Isend of coj */
1251   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1252   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1253   ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
1254   for (proc=0, k=0; proc<size; proc++) {
1255     if (!len_s[proc]) continue;
1256     i    = owners_co[proc];
1257     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1258     k++;
1259   }
1260 
1261   /* receives and sends of coj are complete */
1262   ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
1263   for (i=0; i<merge->nrecv; i++) {
1264     PetscMPIInt icompleted;
1265     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1266   }
1267   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1268   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1269 
1270   /* send and recv coi */
1271   /*-------------------*/
1272   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1273   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1274   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
1275   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1276   for (proc=0,k=0; proc<size; proc++) {
1277     if (!len_s[proc]) continue;
1278     /* form outgoing message for i-structure:
1279          buf_si[0]:                 nrows to be sent
1280                [1:nrows]:           row index (global)
1281                [nrows+1:2*nrows+1]: i-structure index
1282     */
1283     /*-------------------------------------------*/
1284     nrows       = len_si[proc]/2 - 1;
1285     buf_si_i    = buf_si + nrows+1;
1286     buf_si[0]   = nrows;
1287     buf_si_i[0] = 0;
1288     nrows       = 0;
1289     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1290       nzi               = coi[i+1] - coi[i];
1291       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1292       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1293       nrows++;
1294     }
1295     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1296     k++;
1297     buf_si += len_si[proc];
1298   }
1299   i = merge->nrecv;
1300   while (i--) {
1301     PetscMPIInt icompleted;
1302     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1303   }
1304   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1305   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1306   ierr = PetscFree(len_si);CHKERRQ(ierr);
1307   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1308   ierr = PetscFree(swaits);CHKERRQ(ierr);
1309   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1310   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1311 
1312   /* compute the local portion of C (mpi mat) */
1313   /*------------------------------------------*/
1314   /* allocate bi array and free space for accumulating nonzero column info */
1315   ierr  = PetscMalloc1(pn+1,&bi);CHKERRQ(ierr);
1316   bi[0] = 0;
1317 
1318   /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1319   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pdti[pn],PetscIntSumTruncate(poti[pon],ai[am])));
1320   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1321   current_space = free_space;
1322 
1323   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1324   for (k=0; k<merge->nrecv; k++) {
1325     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1326     nrows       = *buf_ri_k[k];
1327     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
1328     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure  */
1329   }
1330 
1331   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1332   rmax = 0;
1333   for (i=0; i<pn; i++) {
1334     /* add pdt[i,:]*AP into lnk */
1335     pnz = pdti[i+1] - pdti[i];
1336     ptJ = pdtj + pdti[i];
1337     for (j=0; j<pnz; j++) {
1338       row  = ptJ[j];  /* row of AP == col of Pt */
1339       anz  = ai[row+1] - ai[row];
1340       Jptr = aj + ai[row];
1341       /* add non-zero cols of AP into the sorted linked list lnk */
1342       ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1343     }
1344 
1345     /* add received col data into lnk */
1346     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1347       if (i == *nextrow[k]) { /* i-th row */
1348         nzi  = *(nextci[k]+1) - *nextci[k];
1349         Jptr = buf_rj[k] + *nextci[k];
1350         ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1351         nextrow[k]++; nextci[k]++;
1352       }
1353     }
1354     nnz = lnk[0];
1355 
1356     /* if free space is not available, make more free space */
1357     if (current_space->local_remaining<nnz) {
1358       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1359       nspacedouble++;
1360     }
1361     /* copy data into free space, then initialize lnk */
1362     ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1363     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1364 
1365     current_space->array           += nnz;
1366     current_space->local_used      += nnz;
1367     current_space->local_remaining -= nnz;
1368 
1369     bi[i+1] = bi[i] + nnz;
1370     if (nnz > rmax) rmax = nnz;
1371   }
1372   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1373 
1374   ierr = PetscMalloc1(bi[pn]+1,&bj);CHKERRQ(ierr);
1375   ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1376 
1377   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1378   if (afill_tmp > afill) afill = afill_tmp;
1379   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
1380   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1381   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1382 
1383   /* create symbolic parallel matrix Cmpi - why cannot be assembled in Numeric part   */
1384   /*----------------------------------------------------------------------------------*/
1385   ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);
1386 
1387   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1388   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1389   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
1390   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1391   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1392   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1393   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1394   for (i=0; i<pn; i++) {
1395     row  = i + rstart;
1396     nnz  = bi[i+1] - bi[i];
1397     Jptr = bj + bi[i];
1398     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1399   }
1400   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1401   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1402   ierr = PetscFree(vals);CHKERRQ(ierr);
1403 
1404   merge->bi        = bi;
1405   merge->bj        = bj;
1406   merge->coi       = coi;
1407   merge->coj       = coj;
1408   merge->buf_ri    = buf_ri;
1409   merge->buf_rj    = buf_rj;
1410   merge->owners_co = owners_co;
1411 
1412   /* attach the supporting struct to Cmpi for reuse */
1413   c           = (Mat_MPIAIJ*)Cmpi->data;
1414   c->ptap     = ptap;
1415   ptap->api   = NULL;
1416   ptap->apj   = NULL;
1417   ptap->merge = merge;
1418   ptap->destroy   = Cmpi->ops->destroy;
1419   ptap->duplicate = Cmpi->ops->duplicate;
1420 
1421   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
1422   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1423   Cmpi->ops->duplicate               = MatDuplicate_MPIAIJ_MatPtAP;
1424 
1425   *C = Cmpi;
1426 #if defined(PETSC_USE_INFO)
1427   if (bi[pn] != 0) {
1428     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
1429     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
1430   } else {
1431     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1432   }
1433 #endif
1434   PetscFunctionReturn(0);
1435 }
1436 
1437 #undef __FUNCT__
1438 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ"
1439 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
1440 {
1441   PetscErrorCode      ierr;
1442   Mat_Merge_SeqsToMPI *merge;
1443   Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1444   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1445   Mat_PtAPMPI         *ptap;
1446   PetscInt            *adj;
1447   PetscInt            i,j,k,anz,pnz,row,*cj,nexta;
1448   MatScalar           *ada,*ca,valtmp;
1449   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1450   MPI_Comm            comm;
1451   PetscMPIInt         size,rank,taga,*len_s;
1452   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1453   PetscInt            **buf_ri,**buf_rj;
1454   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
1455   MPI_Request         *s_waits,*r_waits;
1456   MPI_Status          *status;
1457   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
1458   PetscInt            *ai,*aj,*coi,*coj;
1459   PetscInt            *poJ,*pdJ;
1460   Mat                 A_loc;
1461   Mat_SeqAIJ          *a_loc;
1462 
1463   PetscFunctionBegin;
1464   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1465   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1466   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1467 
1468   ptap  = c->ptap;
1469   merge = ptap->merge;
1470 
1471   /* 2) compute numeric C_seq = P_loc^T*A_loc */
1472   /*------------------------------------------*/
1473   /* get data from symbolic products */
1474   coi    = merge->coi; coj = merge->coj;
1475   ierr   = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);
1476   bi     = merge->bi; bj = merge->bj;
1477   owners = merge->rowmap->range;
1478   ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);
1479 
1480   /* get A_loc by taking all local rows of A */
1481   A_loc = ptap->A_loc;
1482   ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
1483   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1484   ai    = a_loc->i;
1485   aj    = a_loc->j;
1486 
1487   for (i=0; i<am; i++) {
1488     anz = ai[i+1] - ai[i];
1489     adj = aj + ai[i];
1490     ada = a_loc->a + ai[i];
1491 
1492     /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1493     /*-------------------------------------------------------------*/
1494     /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1495     pnz = po->i[i+1] - po->i[i];
1496     poJ = po->j + po->i[i];
1497     pA  = po->a + po->i[i];
1498     for (j=0; j<pnz; j++) {
1499       row = poJ[j];
1500       cj  = coj + coi[row];
1501       ca  = coa + coi[row];
1502       /* perform sparse axpy */
1503       nexta  = 0;
1504       valtmp = pA[j];
1505       for (k=0; nexta<anz; k++) {
1506         if (cj[k] == adj[nexta]) {
1507           ca[k] += valtmp*ada[nexta];
1508           nexta++;
1509         }
1510       }
1511       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1512     }
1513 
1514     /* put the value into Cd (diagonal part) */
1515     pnz = pd->i[i+1] - pd->i[i];
1516     pdJ = pd->j + pd->i[i];
1517     pA  = pd->a + pd->i[i];
1518     for (j=0; j<pnz; j++) {
1519       row = pdJ[j];
1520       cj  = bj + bi[row];
1521       ca  = ba + bi[row];
1522       /* perform sparse axpy */
1523       nexta  = 0;
1524       valtmp = pA[j];
1525       for (k=0; nexta<anz; k++) {
1526         if (cj[k] == adj[nexta]) {
1527           ca[k] += valtmp*ada[nexta];
1528           nexta++;
1529         }
1530       }
1531       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1532     }
1533   }
1534 
1535   /* 3) send and recv matrix values coa */
1536   /*------------------------------------*/
1537   buf_ri = merge->buf_ri;
1538   buf_rj = merge->buf_rj;
1539   len_s  = merge->len_s;
1540   ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
1541   ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1542 
1543   ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
1544   for (proc=0,k=0; proc<size; proc++) {
1545     if (!len_s[proc]) continue;
1546     i    = merge->owners_co[proc];
1547     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1548     k++;
1549   }
1550   if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
1551   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
1552 
1553   ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
1554   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1555   ierr = PetscFree(coa);CHKERRQ(ierr);
1556 
1557   /* 4) insert local Cseq and received values into Cmpi */
1558   /*----------------------------------------------------*/
1559   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1560   for (k=0; k<merge->nrecv; k++) {
1561     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1562     nrows       = *(buf_ri_k[k]);
1563     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th recved i-structure */
1564     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure  */
1565   }
1566 
1567   for (i=0; i<cm; i++) {
1568     row  = owners[rank] + i; /* global row index of C_seq */
1569     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1570     ba_i = ba + bi[i];
1571     bnz  = bi[i+1] - bi[i];
1572     /* add received vals into ba */
1573     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1574       /* i-th row */
1575       if (i == *nextrow[k]) {
1576         cnz    = *(nextci[k]+1) - *nextci[k];
1577         cj     = buf_rj[k] + *(nextci[k]);
1578         ca     = abuf_r[k] + *(nextci[k]);
1579         nextcj = 0;
1580         for (j=0; nextcj<cnz; j++) {
1581           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1582             ba_i[j] += ca[nextcj++];
1583           }
1584         }
1585         nextrow[k]++; nextci[k]++;
1586         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1587       }
1588     }
1589     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
1590   }
1591   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1592   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1593 
1594   ierr = PetscFree(ba);CHKERRQ(ierr);
1595   ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
1596   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1597   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1598   PetscFunctionReturn(0);
1599 }
1600 
1601 PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1602 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ();
1603    differ from MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable in using LLCondensedCreate_Scalable() */
1604 #undef __FUNCT__
1605 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ"
1606 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
1607 {
1608   PetscErrorCode      ierr;
1609   Mat                 Cmpi,A_loc,POt,PDt;
1610   Mat_PtAPMPI         *ptap;
1611   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1612   Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data,*a=(Mat_MPIAIJ*)A->data,*c;
1613   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1614   PetscInt            nnz;
1615   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1616   PetscInt            am  =A->rmap->n,pn=P->cmap->n;
1617   MPI_Comm            comm;
1618   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1619   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1620   PetscInt            len,proc,*dnz,*onz,*owners;
1621   PetscInt            nzi,*bi,*bj;
1622   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1623   MPI_Request         *swaits,*rwaits;
1624   MPI_Status          *sstatus,rstatus;
1625   Mat_Merge_SeqsToMPI *merge;
1626   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1627   PetscReal           afill  =1.0,afill_tmp;
1628   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Armax;
1629   PetscScalar         *vals;
1630   Mat_SeqAIJ          *a_loc,*pdt,*pot;
1631   PetscTable          ta;
1632 
1633   PetscFunctionBegin;
1634   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1635   /* check if matrix local sizes are compatible */
1636   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1637 
1638   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1639   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1640 
1641   /* create struct Mat_PtAPMPI and attached it to C later */
1642   ierr = PetscNew(&ptap);CHKERRQ(ierr);
1643 
1644   /* get A_loc by taking all local rows of A */
1645   ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
1646 
1647   ptap->A_loc = A_loc;
1648   a_loc       = (Mat_SeqAIJ*)(A_loc)->data;
1649   ai          = a_loc->i;
1650   aj          = a_loc->j;
1651 
1652   /* determine symbolic Co=(p->B)^T*A - send to others */
1653   /*----------------------------------------------------*/
1654   ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
1655   pdt  = (Mat_SeqAIJ*)PDt->data;
1656   pdti = pdt->i; pdtj = pdt->j;
1657 
1658   ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
1659   pot  = (Mat_SeqAIJ*)POt->data;
1660   poti = pot->i; potj = pot->j;
1661 
1662   /* then, compute symbolic Co = (p->B)^T*A */
1663   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1664                          >= (num of nonzero rows of C_seq) - pn */
1665   ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
1666   coi[0] = 0;
1667 
1668   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1669   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],ai[am]));
1670   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1671   current_space = free_space;
1672 
1673   /* create and initialize a linked list */
1674   ierr = PetscTableCreate(A->cmap->n + a->B->cmap->N,aN,&ta);CHKERRQ(ierr);
1675   MatRowMergeMax_SeqAIJ(a_loc,am,ta);
1676   ierr = PetscTableGetCount(ta,&Armax);CHKERRQ(ierr);
1677 
1678   ierr = PetscLLCondensedCreate_Scalable(Armax,&lnk);CHKERRQ(ierr);
1679 
1680   for (i=0; i<pon; i++) {
1681     pnz = poti[i+1] - poti[i];
1682     ptJ = potj + poti[i];
1683     for (j=0; j<pnz; j++) {
1684       row  = ptJ[j]; /* row of A_loc == col of Pot */
1685       anz  = ai[row+1] - ai[row];
1686       Jptr = aj + ai[row];
1687       /* add non-zero cols of AP into the sorted linked list lnk */
1688       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1689     }
1690     nnz = lnk[0];
1691 
1692     /* If free space is not available, double the total space in the list */
1693     if (current_space->local_remaining<nnz) {
1694       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1695       nspacedouble++;
1696     }
1697 
1698     /* Copy data into free space, and zero out denserows */
1699     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1700 
1701     current_space->array           += nnz;
1702     current_space->local_used      += nnz;
1703     current_space->local_remaining -= nnz;
1704 
1705     coi[i+1] = coi[i] + nnz;
1706   }
1707 
1708   ierr = PetscMalloc1(coi[pon]+1,&coj);CHKERRQ(ierr);
1709   ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
1710   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); /* must destroy to get a new one for C */
1711 
1712   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1713   if (afill_tmp > afill) afill = afill_tmp;
1714 
1715   /* send j-array (coj) of Co to other processors */
1716   /*----------------------------------------------*/
1717   /* determine row ownership */
1718   ierr = PetscNew(&merge);CHKERRQ(ierr);
1719   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1720 
1721   merge->rowmap->n  = pn;
1722   merge->rowmap->bs = 1;
1723 
1724   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1725   owners = merge->rowmap->range;
1726 
1727   /* determine the number of messages to send, their lengths */
1728   ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
1729   ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);
1730 
1731   len_s        = merge->len_s;
1732   merge->nsend = 0;
1733 
1734   ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
1735   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1736 
1737   proc = 0;
1738   for (i=0; i<pon; i++) {
1739     while (prmap[i] >= owners[proc+1]) proc++;
1740     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1741     len_s[proc] += coi[i+1] - coi[i];
1742   }
1743 
1744   len          = 0; /* max length of buf_si[] */
1745   owners_co[0] = 0;
1746   for (proc=0; proc<size; proc++) {
1747     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1748     if (len_si[proc]) {
1749       merge->nsend++;
1750       len_si[proc] = 2*(len_si[proc] + 1);
1751       len         += len_si[proc];
1752     }
1753   }
1754 
1755   /* determine the number and length of messages to receive for coi and coj  */
1756   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1757   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1758 
1759   /* post the Irecv and Isend of coj */
1760   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1761   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1762   ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
1763   for (proc=0, k=0; proc<size; proc++) {
1764     if (!len_s[proc]) continue;
1765     i    = owners_co[proc];
1766     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1767     k++;
1768   }
1769 
1770   /* receives and sends of coj are complete */
1771   ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
1772   for (i=0; i<merge->nrecv; i++) {
1773     PetscMPIInt icompleted;
1774     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1775   }
1776   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1777   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1778 
1779   /* add received column indices into table to update Armax */
1780   /* Armax can be as large as aN if a P[row,:] is dense, see src/ksp/ksp/examples/tutorials/ex56.c! */
1781   for (k=0; k<merge->nrecv; k++) {/* k-th received message */
1782     Jptr = buf_rj[k];
1783     for (j=0; j<merge->len_r[k]; j++) {
1784       ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
1785     }
1786   }
1787   ierr = PetscTableGetCount(ta,&Armax);CHKERRQ(ierr);
1788   /* printf("Armax %d, an %d + Bn %d = %d, aN %d\n",Armax,A->cmap->n,a->B->cmap->N,A->cmap->n+a->B->cmap->N,aN); */
1789 
1790   /* send and recv coi */
1791   /*-------------------*/
1792   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1793   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1794   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
1795   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1796   for (proc=0,k=0; proc<size; proc++) {
1797     if (!len_s[proc]) continue;
1798     /* form outgoing message for i-structure:
1799          buf_si[0]:                 nrows to be sent
1800                [1:nrows]:           row index (global)
1801                [nrows+1:2*nrows+1]: i-structure index
1802     */
1803     /*-------------------------------------------*/
1804     nrows       = len_si[proc]/2 - 1;
1805     buf_si_i    = buf_si + nrows+1;
1806     buf_si[0]   = nrows;
1807     buf_si_i[0] = 0;
1808     nrows       = 0;
1809     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1810       nzi               = coi[i+1] - coi[i];
1811       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
1812       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1813       nrows++;
1814     }
1815     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1816     k++;
1817     buf_si += len_si[proc];
1818   }
1819   i = merge->nrecv;
1820   while (i--) {
1821     PetscMPIInt icompleted;
1822     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1823   }
1824   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1825   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1826   ierr = PetscFree(len_si);CHKERRQ(ierr);
1827   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1828   ierr = PetscFree(swaits);CHKERRQ(ierr);
1829   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1830   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1831 
1832   /* compute the local portion of C (mpi mat) */
1833   /*------------------------------------------*/
1834   /* allocate bi array and free space for accumulating nonzero column info */
1835   ierr  = PetscMalloc1(pn+1,&bi);CHKERRQ(ierr);
1836   bi[0] = 0;
1837 
1838   /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
1839   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pdti[pn],PetscIntSumTruncate(poti[pon],ai[am])));
1840   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1841   current_space = free_space;
1842 
1843   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1844   for (k=0; k<merge->nrecv; k++) {
1845     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1846     nrows       = *buf_ri_k[k];
1847     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
1848     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recieved i-structure  */
1849   }
1850 
1851   ierr = PetscLLCondensedCreate_Scalable(Armax,&lnk);CHKERRQ(ierr);
1852   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1853   rmax = 0;
1854   for (i=0; i<pn; i++) {
1855     /* add pdt[i,:]*AP into lnk */
1856     pnz = pdti[i+1] - pdti[i];
1857     ptJ = pdtj + pdti[i];
1858     for (j=0; j<pnz; j++) {
1859       row  = ptJ[j];  /* row of AP == col of Pt */
1860       anz  = ai[row+1] - ai[row];
1861       Jptr = aj + ai[row];
1862       /* add non-zero cols of AP into the sorted linked list lnk */
1863       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1864     }
1865 
1866     /* add received col data into lnk */
1867     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1868       if (i == *nextrow[k]) { /* i-th row */
1869         nzi  = *(nextci[k]+1) - *nextci[k];
1870         Jptr = buf_rj[k] + *nextci[k];
1871         ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
1872         nextrow[k]++; nextci[k]++;
1873       }
1874     }
1875     nnz = lnk[0];
1876 
1877     /* if free space is not available, make more free space */
1878     if (current_space->local_remaining<nnz) {
1879       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1880       nspacedouble++;
1881     }
1882     /* copy data into free space, then initialize lnk */
1883     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1884     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1885 
1886     current_space->array           += nnz;
1887     current_space->local_used      += nnz;
1888     current_space->local_remaining -= nnz;
1889 
1890     bi[i+1] = bi[i] + nnz;
1891     if (nnz > rmax) rmax = nnz;
1892   }
1893   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1894 
1895   ierr      = PetscMalloc1(bi[pn]+1,&bj);CHKERRQ(ierr);
1896   ierr      = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1897   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1898   if (afill_tmp > afill) afill = afill_tmp;
1899   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
1900   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
1901 
1902   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1903   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1904 
1905   /* create symbolic parallel matrix Cmpi - why cannot be assembled in Numeric part   */
1906   /*----------------------------------------------------------------------------------*/
1907   ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);
1908 
1909   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1910   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1911   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
1912   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1913   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1914   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1915   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1916   for (i=0; i<pn; i++) {
1917     row  = i + rstart;
1918     nnz  = bi[i+1] - bi[i];
1919     Jptr = bj + bi[i];
1920     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1921   }
1922   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1923   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1924   ierr = PetscFree(vals);CHKERRQ(ierr);
1925 
1926   merge->bi        = bi;
1927   merge->bj        = bj;
1928   merge->coi       = coi;
1929   merge->coj       = coj;
1930   merge->buf_ri    = buf_ri;
1931   merge->buf_rj    = buf_rj;
1932   merge->owners_co = owners_co;
1933 
1934   /* attach the supporting struct to Cmpi for reuse */
1935   c = (Mat_MPIAIJ*)Cmpi->data;
1936 
1937   c->ptap     = ptap;
1938   ptap->api   = NULL;
1939   ptap->apj   = NULL;
1940   ptap->merge = merge;
1941   ptap->apa   = NULL;
1942   ptap->destroy   = Cmpi->ops->destroy;
1943   ptap->duplicate = Cmpi->ops->duplicate;
1944 
1945   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
1946   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1947   Cmpi->ops->duplicate               = MatDuplicate_MPIAIJ_MatPtAP;
1948 
1949   *C = Cmpi;
1950 #if defined(PETSC_USE_INFO)
1951   if (bi[pn] != 0) {
1952     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
1953     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
1954   } else {
1955     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1956   }
1957 #endif
1958   PetscFunctionReturn(0);
1959 }
1960