1 
2 /*
3   Defines matrix-matrix product routines for pairs of MPIAIJ matrices
4           C = A * B
5 */
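
/*
  Typical usage (a minimal sketch; A and B are assumed to be already-assembled MPIAIJ matrices
  with compatible layouts):

    Mat C;
    ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);   symbolic + numeric product
    ...                                                                         change values (not structure) of A and/or B
    ierr = MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);     reuse the symbolic data attached to C
    ierr = MatDestroy(&C);CHKERRQ(ierr);
*/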
6 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7 #include <../src/mat/utils/freespace.h>
8 #include <../src/mat/impls/aij/mpi/mpiaij.h>
9 #include <petscbt.h>
10 #include <../src/mat/impls/dense/mpi/mpidense.h>
11 #include <petsc/private/vecimpl.h>
12 
13 #undef __FUNCT__
14 #define __FUNCT__ "MatMatMult_MPIAIJ_MPIAIJ"
15 PETSC_INTERN PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C)
16 {
17   PetscErrorCode ierr;
18   const char     *algTypes[2] = {"scalable","nonscalable"};
19   PetscInt       alg=1; /* set default algorithm */
20   MPI_Comm       comm;
21 
22   PetscFunctionBegin;
23   if (scall == MAT_INITIAL_MATRIX) {
24     ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
25     if (A->cmap->rstart != B->rmap->rstart || A->cmap->rend != B->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,B->rmap->rstart,B->rmap->rend);
26 
27     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
28     ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,2,algTypes[1],&alg,NULL);CHKERRQ(ierr);
29     ierr = PetscOptionsEnd();CHKERRQ(ierr);
30 
31     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
32     switch (alg) {
33     case 1:
34       ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A,B,fill,C);CHKERRQ(ierr);
35       break;
36     default:
37       ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);CHKERRQ(ierr);
38       break;
39     }
40     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
41   }
42   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
43   ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
44   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
45   PetscFunctionReturn(0);
46 }
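
/*
  Note: the symbolic algorithm used above can be selected at runtime with the option registered in
  this routine, e.g.
      -matmatmult_via nonscalable   (the default; allocates a dense work row with one entry per global column of B)
      -matmatmult_via scalable      (memory-scalable variant based on the condensed linked list)
  The parenthetical notes are informal summaries of the two symbolic routines below.
*/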
47 
48 #undef __FUNCT__
49 #define __FUNCT__ "MatDestroy_MPIAIJ_MatMatMult"
50 PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
51 {
52   PetscErrorCode ierr;
53   Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
54   Mat_PtAPMPI    *ptap = a->ptap;
55 
56   PetscFunctionBegin;
57   ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
58   ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
59   ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
60   ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
61   ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);
62   ierr = PetscFree(ptap->api);CHKERRQ(ierr);
63   ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
64   ierr = PetscFree(ptap->apa);CHKERRQ(ierr);
65   ierr = ptap->destroy(A);CHKERRQ(ierr);
66   ierr = PetscFree(ptap);CHKERRQ(ierr);
67   PetscFunctionReturn(0);
68 }
69 
70 #undef __FUNCT__
71 #define __FUNCT__ "MatDuplicate_MPIAIJ_MatMatMult"
72 PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
73 {
74   PetscErrorCode ierr;
75   Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
76   Mat_PtAPMPI    *ptap = a->ptap;
77 
78   PetscFunctionBegin;
79   ierr = (*ptap->duplicate)(A,op,M);CHKERRQ(ierr);
80 
81   (*M)->ops->destroy   = ptap->destroy;   /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
82   (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
83   PetscFunctionReturn(0);
84 }
85 
86 #undef __FUNCT__
87 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable"
88 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,Mat C)
89 {
90   PetscErrorCode ierr;
91   Mat_MPIAIJ     *a  =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
92   Mat_SeqAIJ     *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
93   Mat_SeqAIJ     *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
94   PetscScalar    *cda=cd->a,*coa=co->a;
95   Mat_SeqAIJ     *p_loc,*p_oth;
96   PetscScalar    *apa,*ca;
97   PetscInt       cm   =C->rmap->n;
98   Mat_PtAPMPI    *ptap=c->ptap;
99   PetscInt       *api,*apj,*apJ,i,k;
100   PetscInt       cstart=C->cmap->rstart;
101   PetscInt       cdnz,conz,k0,k1;
102   MPI_Comm       comm;
103   PetscMPIInt    size;
104 
105   PetscFunctionBegin;
106   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
107   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
108 
109   /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
110   /*-----------------------------------------------------*/
111   /* update numerical values of P_oth and P_loc */
112   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
113   ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
114 
115   /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
116   /*----------------------------------------------------------*/
117   /* get data from symbolic products */
118   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
119   p_oth = NULL;
120   if (size >1) {
121     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
122   }
123 
124   /* get apa for storing dense row A[i,:]*P */
125   apa = ptap->apa;
126 
127   api = ptap->api;
128   apj = ptap->apj;
129   for (i=0; i<cm; i++) {
130     /* compute apa = A[i,:]*P */
131     AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
132 
133     /* set values in C */
134     apJ  = apj + api[i];
135     cdnz = cd->i[i+1] - cd->i[i];
136     conz = co->i[i+1] - co->i[i];
137 
138     /* 1st off-diagonal part of C */
139     ca = coa + co->i[i];
140     k  = 0;
141     for (k0=0; k0<conz; k0++) {
142       if (apJ[k] >= cstart) break;
143       ca[k0]      = apa[apJ[k]];
144       apa[apJ[k]] = 0.0;
145       k++;
146     }
147 
148     /* diagonal part of C */
149     ca = cda + cd->i[i];
150     for (k1=0; k1<cdnz; k1++) {
151       ca[k1]      = apa[apJ[k]];
152       apa[apJ[k]] = 0.0;
153       k++;
154     }
155 
156     /* 2nd off-diagonal part of C */
157     ca = coa + co->i[i];
158     for (; k0<conz; k0++) {
159       ca[k0]      = apa[apJ[k]];
160       apa[apJ[k]] = 0.0;
161       k++;
162     }
163   }
164   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
165   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
166   PetscFunctionReturn(0);
167 }
168 
169 #undef __FUNCT__
170 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable"
171 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,PetscReal fill,Mat *C)
172 {
173   PetscErrorCode     ierr;
174   MPI_Comm           comm;
175   PetscMPIInt        size;
176   Mat                Cmpi;
177   Mat_PtAPMPI        *ptap;
178   PetscFreeSpaceList free_space=NULL,current_space=NULL;
179   Mat_MPIAIJ         *a        =(Mat_MPIAIJ*)A->data,*c;
180   Mat_SeqAIJ         *ad       =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
181   PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
182   PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
183   PetscInt           *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
184   PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n,Crmax;
185   PetscBT            lnkbt;
186   PetscScalar        *apa;
187   PetscReal          afill;
188   PetscTable         ta;
189 
190   PetscFunctionBegin;
191   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
192   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
193 
194   /* create struct Mat_PtAPMPI and attach it to C later */
195   ierr = PetscNew(&ptap);CHKERRQ(ierr);
196 
197   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
198   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
199 
200   /* get P_loc by taking all local rows of P */
201   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
202 
203   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
204   pi_loc = p_loc->i; pj_loc = p_loc->j;
205   if (size > 1) {
206     p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
207     pi_oth = p_oth->i; pj_oth = p_oth->j;
208   } else {
209     p_oth = NULL;
210     pi_oth = NULL; pj_oth = NULL;
211   }
212 
213   /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
214   /*-------------------------------------------------------------------*/
215   ierr      = PetscMalloc1(am+2,&api);CHKERRQ(ierr);
216   ptap->api = api;
217   api[0]    = 0;
218 
219   /* create and initialize a linked list */
220   ierr = PetscTableCreate(pN,pN,&ta);CHKERRQ(ierr);
221   MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
222   MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
223   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
224   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
225 
226   ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);
227 
228   /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
229   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);CHKERRQ(ierr);
230   current_space = free_space;
231 
232   ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
233   for (i=0; i<am; i++) {
234     /* diagonal portion of A */
235     nzi = adi[i+1] - adi[i];
236     for (j=0; j<nzi; j++) {
237       row  = *adj++;
238       pnz  = pi_loc[row+1] - pi_loc[row];
239       Jptr = pj_loc + pi_loc[row];
240       /* add non-zero cols of P into the sorted linked list lnk */
241       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
242     }
243     /* off-diagonal portion of A */
244     nzi = aoi[i+1] - aoi[i];
245     for (j=0; j<nzi; j++) {
246       row  = *aoj++;
247       pnz  = pi_oth[row+1] - pi_oth[row];
248       Jptr = pj_oth + pi_oth[row];
249       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
250     }
251 
252     apnz     = lnk[0];
253     api[i+1] = api[i] + apnz;
254 
255     /* if free space is not available, double the total space in the list */
256     if (current_space->local_remaining<apnz) {
257       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
258       nspacedouble++;
259     }
260 
261     /* Copy data into free space, then initialize lnk */
262     ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
263     ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);
264 
265     current_space->array           += apnz;
266     current_space->local_used      += apnz;
267     current_space->local_remaining -= apnz;
268   }
269 
270   /* Allocate space for apj, initialize apj, and */
271   /* destroy list of free space and other temporary array(s) */
272   ierr = PetscMalloc1(api[am]+1,&ptap->apj);CHKERRQ(ierr);
273   apj  = ptap->apj;
274   ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
275   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
276 
277   /* malloc apa to store dense row A[i,:]*P */
278   ierr = PetscCalloc1(pN,&apa);CHKERRQ(ierr);
279 
280   ptap->apa = apa;
281 
282   /* create and assemble symbolic parallel matrix Cmpi */
283   /*----------------------------------------------------*/
284   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
285   ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
286   ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);
287 
288   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
289   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
290   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
291   for (i=0; i<am; i++) {
292     row  = i + rstart;
293     apnz = api[i+1] - api[i];
294     ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
295     apj += apnz;
296   }
297   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
298   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
299 
300   ptap->destroy        = Cmpi->ops->destroy;
301   ptap->duplicate      = Cmpi->ops->duplicate;
302   Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
303   Cmpi->ops->destroy   = MatDestroy_MPIAIJ_MatMatMult;
304   Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
305 
306   /* attach the supporting struct to Cmpi for reuse */
307   c       = (Mat_MPIAIJ*)Cmpi->data;
308   c->ptap = ptap;
309 
310   *C = Cmpi;
311 
312   /* set MatInfo */
313   afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
314   if (afill < 1.0) afill = 1.0;
315   Cmpi->info.mallocs           = nspacedouble;
316   Cmpi->info.fill_ratio_given  = fill;
317   Cmpi->info.fill_ratio_needed = afill;
318 
319 #if defined(PETSC_USE_INFO)
320   if (api[am]) {
321     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
322     ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
323   } else {
324     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
325   }
326 #endif
327   PetscFunctionReturn(0);
328 }
329 
330 #undef __FUNCT__
331 #define __FUNCT__ "MatMatMult_MPIAIJ_MPIDense"
332 PETSC_INTERN PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
333 {
334   PetscErrorCode ierr;
335 
336   PetscFunctionBegin;
337   if (scall == MAT_INITIAL_MATRIX) {
338     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
339     ierr = MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);CHKERRQ(ierr);
340     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
341   }
342   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
343   ierr = MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);CHKERRQ(ierr);
344   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
345   PetscFunctionReturn(0);
346 }
347 
348 typedef struct {
349   Mat         workB;
350   PetscScalar *rvalues,*svalues;
351   MPI_Request *rwaits,*swaits;
352 } MPIAIJ_MPIDense;
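
/*
  This context is attached to the product matrix C inside a PetscContainer composed under the
  name "workB" (see MatMatMultSymbolic_MPIAIJ_MPIDense() below); each call to MatMPIDenseScatter()
  recovers it with the pattern already used in that routine:

    PetscContainer  container;
    MPIAIJ_MPIDense *contents;
    ierr = PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);CHKERRQ(ierr);
    ierr = PetscContainerGetPointer(container,(void**)&contents);CHKERRQ(ierr);
*/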
353 
354 #undef __FUNCT__
355 #define __FUNCT__ "MatMPIAIJ_MPIDenseDestroy"
356 PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
357 {
358   MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
359   PetscErrorCode  ierr;
360 
361   PetscFunctionBegin;
362   ierr = MatDestroy(&contents->workB);CHKERRQ(ierr);
363   ierr = PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);CHKERRQ(ierr);
364   ierr = PetscFree(contents);CHKERRQ(ierr);
365   PetscFunctionReturn(0);
366 }
367 
368 #undef __FUNCT__
369 #define __FUNCT__ "MatMatMultNumeric_MPIDense"
370 /*
371     This is a "dummy function" that handles the case where matrix C was created as a dense matrix
372   directly by the user and passed to MatMatMult() with the MAT_REUSE_MATRIX option
373 
374   It is the same as MatMatMultSymbolic_MPIAIJ_MPIDense() except does not create C
375 */
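/*
   For example (an illustrative sketch, not code taken from this file): a user who has created and
   preallocated a suitable MPIDENSE matrix C can call
     ierr = MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
   and this routine sets up the "workB" container on the first pass before delegating to
   MatMatMultNumeric_MPIAIJ_MPIDense().
*/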
376 PetscErrorCode MatMatMultNumeric_MPIDense(Mat A,Mat B,Mat C)
377 {
378   PetscErrorCode         ierr;
379   PetscBool              flg;
380   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
381   PetscInt               nz   = aij->B->cmap->n;
382   PetscContainer         container;
383   MPIAIJ_MPIDense        *contents;
384   VecScatter             ctx   = aij->Mvctx;
385   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
386   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
387 
388   PetscFunctionBegin;
389   ierr = PetscObjectTypeCompare((PetscObject)B,MATMPIDENSE,&flg);CHKERRQ(ierr);
390   if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Second matrix must be mpidense");
391 
392   /* Handle the case where the user provided the final C matrix rather than calling MatMatMult() with MAT_INITIAL_MATRIX */
393   ierr = PetscObjectTypeCompare((PetscObject)A,MATMPIAIJ,&flg);CHKERRQ(ierr);
394   if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"First matrix must be MPIAIJ");
395 
396   C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
397 
398   ierr = PetscNew(&contents);CHKERRQ(ierr);
399   /* Create work matrix used to store off processor rows of B needed for local product */
400   ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
401   /* Create work arrays needed */
402   ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
403                       B->cmap->N*to->starts[to->n],&contents->svalues,
404                       from->n,&contents->rwaits,
405                       to->n,&contents->swaits);CHKERRQ(ierr);
406 
407   ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr);
408   ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
409   ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
410   ierr = PetscObjectCompose((PetscObject)C,"workB",(PetscObject)container);CHKERRQ(ierr);
411   ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
412 
413   ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr);
414   PetscFunctionReturn(0);
415 }
416 
417 #undef __FUNCT__
418 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIDense"
419 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
420 {
421   PetscErrorCode         ierr;
422   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
423   PetscInt               nz   = aij->B->cmap->n;
424   PetscContainer         container;
425   MPIAIJ_MPIDense        *contents;
426   VecScatter             ctx   = aij->Mvctx;
427   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
428   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
429   PetscInt               m     = A->rmap->n,n=B->cmap->n;
430 
431   PetscFunctionBegin;
432   ierr = MatCreate(PetscObjectComm((PetscObject)B),C);CHKERRQ(ierr);
433   ierr = MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);CHKERRQ(ierr);
434   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
435   ierr = MatSetType(*C,MATMPIDENSE);CHKERRQ(ierr);
436   ierr = MatMPIDenseSetPreallocation(*C,NULL);CHKERRQ(ierr);
437   ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
438   ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
439 
440   (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
441 
442   ierr = PetscNew(&contents);CHKERRQ(ierr);
443   /* Create work matrix used to store off processor rows of B needed for local product */
444   ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
445   /* Create work arrays needed */
446   ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
447                       B->cmap->N*to->starts[to->n],&contents->svalues,
448                       from->n,&contents->rwaits,
449                       to->n,&contents->swaits);CHKERRQ(ierr);
450 
451   ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr);
452   ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
453   ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
454   ierr = PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);CHKERRQ(ierr);
455   ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
456   PetscFunctionReturn(0);
457 }
458 
459 #undef __FUNCT__
460 #define __FUNCT__ "MatMPIDenseScatter"
461 /*
462     Performs an efficient scatter on the rows of B needed by this process; this is
463     a modification of the VecScatterBegin_() routines.
464 */
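/*
    The scattered rows land in the work matrix workB attached to C; informally, the local product
    computed afterwards (see MatMatMultNumeric_MPIAIJ_MPIDense() below) is
        C_local = aij->A * B_local + aij->B * workB
    where aij->A and aij->B are the diagonal and off-diagonal blocks of A on this process.
*/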
465 PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
466 {
467   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*)A->data;
468   PetscErrorCode         ierr;
469   PetscScalar            *b,*w,*svalues,*rvalues;
470   VecScatter             ctx   = aij->Mvctx;
471   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
472   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
473   PetscInt               i,j,k;
474   PetscInt               *sindices,*sstarts,*rindices,*rstarts;
475   PetscMPIInt            *sprocs,*rprocs,nrecvs;
476   MPI_Request            *swaits,*rwaits;
477   MPI_Comm               comm;
478   PetscMPIInt            tag  = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
479   MPI_Status             status;
480   MPIAIJ_MPIDense        *contents;
481   PetscContainer         container;
482   Mat                    workB;
483 
484   PetscFunctionBegin;
485   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
486   ierr = PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);CHKERRQ(ierr);
487   if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
488   ierr = PetscContainerGetPointer(container,(void**)&contents);CHKERRQ(ierr);
489 
490   workB = *outworkB = contents->workB;
491   if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",workB->rmap->n,nrows);
492   sindices = to->indices;
493   sstarts  = to->starts;
494   sprocs   = to->procs;
495   swaits   = contents->swaits;
496   svalues  = contents->svalues;
497 
498   rindices = from->indices;
499   rstarts  = from->starts;
500   rprocs   = from->procs;
501   rwaits   = contents->rwaits;
502   rvalues  = contents->rvalues;
503 
504   ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr);
505   ierr = MatDenseGetArray(workB,&w);CHKERRQ(ierr);
506 
507   for (i=0; i<from->n; i++) {
508     ierr = MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);CHKERRQ(ierr);
509   }
510 
511   for (i=0; i<to->n; i++) {
512     /* pack a message at a time */
513     for (j=0; j<sstarts[i+1]-sstarts[i]; j++) {
514       for (k=0; k<ncols; k++) {
515         svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
516       }
517     }
518     ierr = MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);CHKERRQ(ierr);
519   }
520 
521   nrecvs = from->n;
522   while (nrecvs) {
523     ierr = MPI_Waitany(from->n,rwaits,&imdex,&status);CHKERRQ(ierr);
524     nrecvs--;
525     /* unpack a message at a time */
526     for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++) {
527       for (k=0; k<ncols; k++) {
528         w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
529       }
530     }
531   }
532   if (to->n) {ierr = MPI_Waitall(to->n,swaits,to->sstatus);CHKERRQ(ierr);}
533 
534   ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr);
535   ierr = MatDenseRestoreArray(workB,&w);CHKERRQ(ierr);
536   ierr = MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
537   ierr = MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
538   PetscFunctionReturn(0);
539 }
540 extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);
541 
542 #undef __FUNCT__
543 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIDense"
544 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
545 {
546   PetscErrorCode ierr;
547   Mat_MPIAIJ     *aij    = (Mat_MPIAIJ*)A->data;
548   Mat_MPIDense   *bdense = (Mat_MPIDense*)B->data;
549   Mat_MPIDense   *cdense = (Mat_MPIDense*)C->data;
550   Mat            workB;
551 
552   PetscFunctionBegin;
553   /* diagonal block of A times all local rows of B */
554   ierr = MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);CHKERRQ(ierr);
555 
556   /* get off processor parts of B needed to complete the product */
557   ierr = MatMPIDenseScatter(A,B,C,&workB);CHKERRQ(ierr);
558 
559   /* off-diagonal block of A times nonlocal rows of B */
560   ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);CHKERRQ(ierr);
561   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
562   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
563   PetscFunctionReturn(0);
564 }
565 
566 #undef __FUNCT__
567 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ"
568 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
569 {
570   PetscErrorCode ierr;
571   Mat_MPIAIJ     *a   = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
572   Mat_SeqAIJ     *ad  = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
573   Mat_SeqAIJ     *cd  = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
574   PetscInt       *adi = ad->i,*adj,*aoi=ao->i,*aoj;
575   PetscScalar    *ada,*aoa,*cda=cd->a,*coa=co->a;
576   Mat_SeqAIJ     *p_loc,*p_oth;
577   PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
578   PetscScalar    *pa_loc,*pa_oth,*pa,valtmp,*ca;
579   PetscInt       cm          = C->rmap->n,anz,pnz;
580   Mat_PtAPMPI    *ptap       = c->ptap;
581   PetscScalar    *apa_sparse = ptap->apa;
582   PetscInt       *api,*apj,*apJ,i,j,k,row;
583   PetscInt       cstart = C->cmap->rstart;
584   PetscInt       cdnz,conz,k0,k1,nextp;
585   MPI_Comm       comm;
586   PetscMPIInt    size;
587 
588   PetscFunctionBegin;
589   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
590   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
591 
592   /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
593   /*-----------------------------------------------------*/
594   /* update numerical values of P_oth and P_loc */
595   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
596   ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
597 
598   /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
599   /*----------------------------------------------------------*/
600   /* get data from symbolic products */
601   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
602   pi_loc = p_loc->i; pj_loc = p_loc->j; pa_loc = p_loc->a;
603   if (size >1) {
604     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
605     pi_oth = p_oth->i; pj_oth = p_oth->j; pa_oth = p_oth->a;
606   } else {
607     p_oth = NULL; pi_oth = NULL; pj_oth = NULL; pa_oth = NULL;
608   }
609 
610   api = ptap->api;
611   apj = ptap->apj;
612   for (i=0; i<cm; i++) {
613     apJ = apj + api[i];
614 
615     /* diagonal portion of A */
616     anz = adi[i+1] - adi[i];
617     adj = ad->j + adi[i];
618     ada = ad->a + adi[i];
619     for (j=0; j<anz; j++) {
620       row = adj[j];
621       pnz = pi_loc[row+1] - pi_loc[row];
622       pj  = pj_loc + pi_loc[row];
623       pa  = pa_loc + pi_loc[row];
624       /* perform sparse axpy */
625       valtmp = ada[j];
626       nextp  = 0;
627       for (k=0; nextp<pnz; k++) {
628         if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
629           apa_sparse[k] += valtmp*pa[nextp++];
630         }
631       }
632       ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
633     }
634 
635     /* off-diagonal portion of A */
636     anz = aoi[i+1] - aoi[i];
637     aoj = ao->j + aoi[i];
638     aoa = ao->a + aoi[i];
639     for (j=0; j<anz; j++) {
640       row = aoj[j];
641       pnz = pi_oth[row+1] - pi_oth[row];
642       pj  = pj_oth + pi_oth[row];
643       pa  = pa_oth + pi_oth[row];
644       /* perform sparse axpy */
645       valtmp = aoa[j];
646       nextp  = 0;
647       for (k=0; nextp<pnz; k++) {
648         if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
649           apa_sparse[k] += valtmp*pa[nextp++];
650         }
651       }
652       ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
653     }
654 
655     /* set values in C */
656     cdnz = cd->i[i+1] - cd->i[i];
657     conz = co->i[i+1] - co->i[i];
658 
659     /* 1st off-diagonal part of C */
660     ca = coa + co->i[i];
661     k  = 0;
662     for (k0=0; k0<conz; k0++) {
663       if (apJ[k] >= cstart) break;
664       ca[k0]        = apa_sparse[k];
665       apa_sparse[k] = 0.0;
666       k++;
667     }
668 
669     /* diagonal part of C */
670     ca = cda + cd->i[i];
671     for (k1=0; k1<cdnz; k1++) {
672       ca[k1]        = apa_sparse[k];
673       apa_sparse[k] = 0.0;
674       k++;
675     }
676 
677     /* 2nd off-diagonal part of C */
678     ca = coa + co->i[i];
679     for (; k0<conz; k0++) {
680       ca[k0]        = apa_sparse[k];
681       apa_sparse[k] = 0.0;
682       k++;
683     }
684   }
685   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
686   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
687   PetscFunctionReturn(0);
688 }
689 
690 /* Same as MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(), except it uses PetscLLCondensed_Scalable and a sparse accumulator to avoid the O(BN) (global number of columns of B) memory requirement */
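/*
   Rough memory comparison of the two symbolic routines (informal, based on the allocations in this file):
     nonscalable:  PetscCalloc1(pN,&apa)        with pN = P->cmap->N, one accumulator entry per global column of P
     scalable:     PetscCalloc1(apnz_max,&apa)  one accumulator entry per nonzero of the densest row of A*P
*/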
691 #undef __FUNCT__
692 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ"
693 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
694 {
695   PetscErrorCode     ierr;
696   MPI_Comm           comm;
697   PetscMPIInt        size;
698   Mat                Cmpi;
699   Mat_PtAPMPI        *ptap;
700   PetscFreeSpaceList free_space = NULL,current_space=NULL;
701   Mat_MPIAIJ         *a         = (Mat_MPIAIJ*)A->data,*c;
702   Mat_SeqAIJ         *ad        = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
703   PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
704   PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
705   PetscInt           i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max;
706   PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
707   PetscReal          afill;
708   PetscScalar        *apa;
709   PetscTable         ta;
710 
711   PetscFunctionBegin;
712   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
713   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
714 
715   /* create struct Mat_PtAPMPI and attach it to C later */
716   ierr = PetscNew(&ptap);CHKERRQ(ierr);
717 
718   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
719   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
720 
721   /* get P_loc by taking all local rows of P */
722   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
723 
724   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
725   pi_loc = p_loc->i; pj_loc = p_loc->j;
726   if (size > 1) {
727     p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
728     pi_oth = p_oth->i; pj_oth = p_oth->j;
729   } else {
730     p_oth  = NULL;
731     pi_oth = NULL; pj_oth = NULL;
732   }
733 
734   /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
735   /*-------------------------------------------------------------------*/
736   ierr      = PetscMalloc1(am+2,&api);CHKERRQ(ierr);
737   ptap->api = api;
738   api[0]    = 0;
739 
740   /* create and initialize a linked list */
741   apnz_max = 6*(p_loc->rmax + (PetscInt)(1.e-2*pN)); /* expected apnz_max */
742   if (apnz_max > pN) apnz_max = pN;
743   ierr = PetscTableCreate(apnz_max,pN,&ta);CHKERRQ(ierr);
744 
745   /* Calculate apnz_max */
746   apnz_max = 0;
747   for (i=0; i<am; i++) {
748     ierr = PetscTableRemoveAll(ta);CHKERRQ(ierr);
749     /* diagonal portion of A */
750     nzi  = adi[i+1] - adi[i];
751     Jptr = adj+adi[i];  /* cols of A_diag */
752     MatMergeRows_SeqAIJ(p_loc,nzi,Jptr,ta);
753     ierr = PetscTableGetCount(ta,&apnz);CHKERRQ(ierr);
754     if (apnz_max < apnz) apnz_max = apnz;
755 
756     /*  off-diagonal portion of A */
757     nzi = aoi[i+1] - aoi[i];
758     Jptr = aoj+aoi[i];  /* cols of A_off */
759     MatMergeRows_SeqAIJ(p_oth,nzi,Jptr,ta);
760     ierr = PetscTableGetCount(ta,&apnz);CHKERRQ(ierr);
761     if (apnz_max < apnz) apnz_max = apnz;
762   }
763   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
764 
765   ierr = PetscLLCondensedCreate_Scalable(apnz_max,&lnk);CHKERRQ(ierr);
766 
767   /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
768   ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);CHKERRQ(ierr);
769   current_space = free_space;
770   ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
771   for (i=0; i<am; i++) {
772     /* diagonal portion of A */
773     nzi = adi[i+1] - adi[i];
774     for (j=0; j<nzi; j++) {
775       row  = *adj++;
776       pnz  = pi_loc[row+1] - pi_loc[row];
777       Jptr = pj_loc + pi_loc[row];
778       /* add non-zero cols of P into the sorted linked list lnk */
779       ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
780     }
781     /* off-diagonal portion of A */
782     nzi = aoi[i+1] - aoi[i];
783     for (j=0; j<nzi; j++) {
784       row  = *aoj++;
785       pnz  = pi_oth[row+1] - pi_oth[row];
786       Jptr = pj_oth + pi_oth[row];
787       ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
788     }
789 
790     apnz     = *lnk;
791     api[i+1] = api[i] + apnz;
792 
793     /* if free space is not available, double the total space in the list */
794     if (current_space->local_remaining<apnz) {
795       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
796       nspacedouble++;
797     }
798 
799     /* Copy data into free space, then initialize lnk */
800     ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);
801     ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);
802 
803     current_space->array           += apnz;
804     current_space->local_used      += apnz;
805     current_space->local_remaining -= apnz;
806   }
807 
808   /* Allocate space for apj, initialize apj, and */
809   /* destroy list of free space and other temporary array(s) */
810   ierr = PetscMalloc1(api[am]+1,&ptap->apj);CHKERRQ(ierr);
811   apj  = ptap->apj;
812   ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
813   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
814 
815   /* create and assemble symbolic parallel matrix Cmpi */
816   /*----------------------------------------------------*/
817   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
818   ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
819   ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);
820   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
821   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
822   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
823 
824   /* malloc apa for assembling Cmpi */
825   ierr = PetscCalloc1(apnz_max,&apa);CHKERRQ(ierr);
826 
827   ptap->apa = apa;
828   for (i=0; i<am; i++) {
829     row  = i + rstart;
830     apnz = api[i+1] - api[i];
831     ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
832     apj += apnz;
833   }
834   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
835   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
836 
837   ptap->destroy             = Cmpi->ops->destroy;
838   ptap->duplicate           = Cmpi->ops->duplicate;
839   Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ;
840   Cmpi->ops->destroy        = MatDestroy_MPIAIJ_MatMatMult;
841   Cmpi->ops->duplicate      = MatDuplicate_MPIAIJ_MatMatMult;
842 
843   /* attach the supporting struct to Cmpi for reuse */
844   c       = (Mat_MPIAIJ*)Cmpi->data;
845   c->ptap = ptap;
846 
847   *C = Cmpi;
848 
849   /* set MatInfo */
850   afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
851   if (afill < 1.0) afill = 1.0;
852   Cmpi->info.mallocs           = nspacedouble;
853   Cmpi->info.fill_ratio_given  = fill;
854   Cmpi->info.fill_ratio_needed = afill;
855 
856 #if defined(PETSC_USE_INFO)
857   if (api[am]) {
858     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
859     ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
860   } else {
861     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
862   }
863 #endif
864   PetscFunctionReturn(0);
865 }
866 
867 /*-------------------------------------------------------------------------*/
868 #undef __FUNCT__
869 #define __FUNCT__ "MatTransposeMatMult_MPIAIJ_MPIAIJ"
870 PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
871 {
872   PetscErrorCode ierr;
873   const char     *algTypes[3] = {"scalable","nonscalable","matmatmult"};
874   PetscInt       alg=0; /* set default algorithm */
875 
876   PetscFunctionBegin;
877   if (scall == MAT_INITIAL_MATRIX) {
878     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
879     ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,3,algTypes[0],&alg,NULL);CHKERRQ(ierr);
880     ierr = PetscOptionsEnd();CHKERRQ(ierr);
881 
882     ierr = PetscLogEventBegin(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
883     switch (alg) {
884     case 1:
885       ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(P,A,fill,C);CHKERRQ(ierr);
886       break;
887     case 2:
888     {
889       Mat         Pt;
890       Mat_PtAPMPI *ptap;
891       Mat_MPIAIJ  *c;
892       ierr = MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);CHKERRQ(ierr);
893       ierr = MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr);
894       c        = (Mat_MPIAIJ*)(*C)->data;
895       ptap     = c->ptap;
896       ptap->Pt = Pt;
897       (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult;
898       PetscFunctionReturn(0);
899     }
900       break;
901     default:
902       ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);CHKERRQ(ierr);
903       break;
904     }
905     ierr = PetscLogEventEnd(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
906   }
907   ierr = PetscLogEventBegin(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
908   ierr = (*(*C)->ops->mattransposemultnumeric)(P,A,*C);CHKERRQ(ierr);
909   ierr = PetscLogEventEnd(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
910   PetscFunctionReturn(0);
911 }
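
/*
  As above, the symbolic algorithm for the transpose product can be chosen at runtime:
      -mattransposematmult_via scalable      (the default)
      -mattransposematmult_via nonscalable
      -mattransposematmult_via matmatmult    (explicitly forms Pt = P^T and calls MatMatMult(Pt,A,...))
  The parenthetical notes are informal summaries of the cases handled in the switch above.
*/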
912 
913 /* This routine only works when scall=MAT_REUSE_MATRIX! */
914 #undef __FUNCT__
915 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult"
916 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult(Mat P,Mat A,Mat C)
917 {
918   PetscErrorCode ierr;
919   Mat_MPIAIJ     *c=(Mat_MPIAIJ*)C->data;
920   Mat_PtAPMPI    *ptap= c->ptap;
921   Mat            Pt=ptap->Pt;
922 
923   PetscFunctionBegin;
924   ierr = MatTranspose(P,MAT_REUSE_MATRIX,&Pt);CHKERRQ(ierr);
925   ierr = MatMatMultNumeric(Pt,A,C);CHKERRQ(ierr);
926   PetscFunctionReturn(0);
927 }
928 
929 /* Non-scalable version, use dense axpy */
930 #undef __FUNCT__
931 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable"
932 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,Mat C)
933 {
934   PetscErrorCode      ierr;
935   Mat_Merge_SeqsToMPI *merge;
936   Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
937   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
938   Mat_PtAPMPI         *ptap;
939   PetscInt            *adj,*aJ;
940   PetscInt            i,j,k,anz,pnz,row,*cj;
941   MatScalar           *ada,*aval,*ca,valtmp;
942   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
943   MPI_Comm            comm;
944   PetscMPIInt         size,rank,taga,*len_s;
945   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
946   PetscInt            **buf_ri,**buf_rj;
947   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
948   MPI_Request         *s_waits,*r_waits;
949   MPI_Status          *status;
950   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
951   PetscInt            *ai,*aj,*coi,*coj;
952   PetscInt            *poJ,*pdJ;
953   Mat                 A_loc;
954   Mat_SeqAIJ          *a_loc;
955 
956   PetscFunctionBegin;
957   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
958   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
959   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
960 
961   ptap  = c->ptap;
962   merge = ptap->merge;
963 
964   /* 2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */
965   /*--------------------------------------------------------------*/
966   /* get data from symbolic products */
967   coi  = merge->coi; coj = merge->coj;
968   ierr = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);
969 
970   bi     = merge->bi; bj = merge->bj;
971   owners = merge->rowmap->range;
972   ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);
973 
974   /* get A_loc by taking all local rows of A */
975   A_loc = ptap->A_loc;
976   ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
977   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
978   ai    = a_loc->i;
979   aj    = a_loc->j;
980 
981   ierr = PetscCalloc1(A->cmap->N,&aval);CHKERRQ(ierr); /* non-scalable!!! */
982 
983   for (i=0; i<am; i++) {
984     /* 2-a) put A[i,:] to dense array aval */
985     anz = ai[i+1] - ai[i];
986     adj = aj + ai[i];
987     ada = a_loc->a + ai[i];
988     for (j=0; j<anz; j++) {
989       aval[adj[j]] = ada[j];
990     }
991 
992     /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
993     /*--------------------------------------------------------------*/
994     /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
995     pnz = po->i[i+1] - po->i[i];
996     poJ = po->j + po->i[i];
997     pA  = po->a + po->i[i];
998     for (j=0; j<pnz; j++) {
999       row = poJ[j];
1000       cnz = coi[row+1] - coi[row];
1001       cj  = coj + coi[row];
1002       ca  = coa + coi[row];
1003       /* perform dense axpy */
1004       valtmp = pA[j];
1005       for (k=0; k<cnz; k++) {
1006         ca[k] += valtmp*aval[cj[k]];
1007       }
1008       ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1009     }
1010 
1011     /* put the value into Cd (diagonal part) */
1012     pnz = pd->i[i+1] - pd->i[i];
1013     pdJ = pd->j + pd->i[i];
1014     pA  = pd->a + pd->i[i];
1015     for (j=0; j<pnz; j++) {
1016       row = pdJ[j];
1017       cnz = bi[row+1] - bi[row];
1018       cj  = bj + bi[row];
1019       ca  = ba + bi[row];
1020       /* perform dense axpy */
1021       valtmp = pA[j];
1022       for (k=0; k<cnz; k++) {
1023         ca[k] += valtmp*aval[cj[k]];
1024       }
1025       ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1026     }
1027 
1028     /* zero the current row of Pt*A */
1029     aJ = aj + ai[i];
1030     for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
1031   }
1032 
1033   /* 3) send and recv matrix values coa */
1034   /*------------------------------------*/
1035   buf_ri = merge->buf_ri;
1036   buf_rj = merge->buf_rj;
1037   len_s  = merge->len_s;
1038   ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
1039   ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1040 
1041   ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
1042   for (proc=0,k=0; proc<size; proc++) {
1043     if (!len_s[proc]) continue;
1044     i    = merge->owners_co[proc];
1045     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1046     k++;
1047   }
1048   if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
1049   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
1050 
1051   ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
1052   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1053   ierr = PetscFree(coa);CHKERRQ(ierr);
1054 
1055   /* 4) insert local Cseq and received values into Cmpi */
1056   /*----------------------------------------------------*/
1057   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1058   for (k=0; k<merge->nrecv; k++) {
1059     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1060     nrows       = *(buf_ri_k[k]);
1061     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
1062     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1063   }
1064 
1065   for (i=0; i<cm; i++) {
1066     row  = owners[rank] + i; /* global row index of C_seq */
1067     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1068     ba_i = ba + bi[i];
1069     bnz  = bi[i+1] - bi[i];
1070     /* add received vals into ba */
1071     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1072       /* i-th row */
1073       if (i == *nextrow[k]) {
1074         cnz    = *(nextci[k]+1) - *nextci[k];
1075         cj     = buf_rj[k] + *(nextci[k]);
1076         ca     = abuf_r[k] + *(nextci[k]);
1077         nextcj = 0;
1078         for (j=0; nextcj<cnz; j++) {
1079           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1080             ba_i[j] += ca[nextcj++];
1081           }
1082         }
1083         nextrow[k]++; nextci[k]++;
1084         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1085       }
1086     }
1087     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
1088   }
1089   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1090   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1091 
1092   ierr = PetscFree(ba);CHKERRQ(ierr);
1093   ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
1094   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1095   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1096   ierr = PetscFree(aval);CHKERRQ(ierr);
1097   PetscFunctionReturn(0);
1098 }
1099 
1100 PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1101 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1102 #undef __FUNCT__
1103 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable"
1104 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,PetscReal fill,Mat *C)
1105 {
1106   PetscErrorCode      ierr;
1107   Mat                 Cmpi,A_loc,POt,PDt;
1108   Mat_PtAPMPI         *ptap;
1109   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1110   Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
1111   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1112   PetscInt            nnz;
1113   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1114   PetscInt            am=A->rmap->n,pn=P->cmap->n;
1115   PetscBT             lnkbt;
1116   MPI_Comm            comm;
1117   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1118   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1119   PetscInt            len,proc,*dnz,*onz,*owners;
1120   PetscInt            nzi,*bi,*bj;
1121   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1122   MPI_Request         *swaits,*rwaits;
1123   MPI_Status          *sstatus,rstatus;
1124   Mat_Merge_SeqsToMPI *merge;
1125   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1126   PetscReal           afill  =1.0,afill_tmp;
1127   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N;
1128   PetscScalar         *vals;
1129   Mat_SeqAIJ          *a_loc, *pdt,*pot;
1130 
1131   PetscFunctionBegin;
1132   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1133   /* check if matrix local sizes are compatible */
1134   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1135 
1136   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1137   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1138 
1139   /* create struct Mat_PtAPMPI and attach it to C later */
1140   ierr = PetscNew(&ptap);CHKERRQ(ierr);
1141 
1142   /* get A_loc by taking all local rows of A */
1143   ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
1144 
1145   ptap->A_loc = A_loc;
1146 
1147   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1148   ai    = a_loc->i;
1149   aj    = a_loc->j;
1150 
1151   /* determine symbolic Co=(p->B)^T*A - send to others */
1152   /*----------------------------------------------------*/
1153   ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
1154   pdt  = (Mat_SeqAIJ*)PDt->data;
1155   pdti = pdt->i; pdtj = pdt->j;
1156 
1157   ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
1158   pot  = (Mat_SeqAIJ*)POt->data;
1159   poti = pot->i; potj = pot->j;
1160 
1161   /* then, compute symbolic Co = (p->B)^T*A */
1162   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */
1163   ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
1164   coi[0] = 0;
1165 
1166   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1167   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],ai[am]));
1168   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1169   current_space = free_space;
1170 
1171   /* create and initialize a linked list */
1172   ierr = PetscLLCondensedCreate(aN,aN,&lnk,&lnkbt);CHKERRQ(ierr);
1173 
1174   for (i=0; i<pon; i++) {
1175     pnz = poti[i+1] - poti[i];
1176     ptJ = potj + poti[i];
1177     for (j=0; j<pnz; j++) {
1178       row  = ptJ[j]; /* row of A_loc == col of Pot */
1179       anz  = ai[row+1] - ai[row];
1180       Jptr = aj + ai[row];
1181       /* add non-zero cols of AP into the sorted linked list lnk */
1182       ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1183     }
1184     nnz = lnk[0];
1185 
1186     /* If free space is not available, double the total space in the list */
1187     if (current_space->local_remaining<nnz) {
1188       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1189       nspacedouble++;
1190     }
1191 
1192     /* Copy data into free space, then initialize lnk */
1193     ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1194 
1195     current_space->array           += nnz;
1196     current_space->local_used      += nnz;
1197     current_space->local_remaining -= nnz;
1198 
1199     coi[i+1] = coi[i] + nnz;
1200   }
1201 
1202   ierr = PetscMalloc1(coi[pon]+1,&coj);CHKERRQ(ierr);
1203   ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
1204 
1205   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1206   if (afill_tmp > afill) afill = afill_tmp;
1207 
1208   /* send j-array (coj) of Co to other processors */
1209   /*----------------------------------------------*/
1210   /* determine row ownership */
1211   ierr = PetscNew(&merge);CHKERRQ(ierr);
1212   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1213 
1214   merge->rowmap->n  = pn;
1215   merge->rowmap->bs = 1;
1216 
1217   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1218   owners = merge->rowmap->range;
1219 
1220   /* determine the number of messages to send, their lengths */
1221   ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
1222   ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);
1223 
1224   len_s        = merge->len_s;
1225   merge->nsend = 0;
1226 
1227   ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
1228   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1229 
1230   proc = 0;
1231   for (i=0; i<pon; i++) {
1232     while (prmap[i] >= owners[proc+1]) proc++;
1233     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1234     len_s[proc] += coi[i+1] - coi[i];
1235   }
1236 
1237   len          = 0; /* max length of buf_si[] */
1238   owners_co[0] = 0;
1239   for (proc=0; proc<size; proc++) {
1240     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1241     if (len_si[proc]) {
1242       merge->nsend++;
1243       len_si[proc] = 2*(len_si[proc] + 1);
1244       len         += len_si[proc];
1245     }
1246   }
1247 
1248   /* determine the number and length of messages to receive for coi and coj  */
1249   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1250   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1251 
1252   /* post the Irecv and Isend of coj */
1253   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1254   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1255   ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
1256   for (proc=0, k=0; proc<size; proc++) {
1257     if (!len_s[proc]) continue;
1258     i    = owners_co[proc];
1259     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1260     k++;
1261   }
1262 
1263   /* receives and sends of coj are complete */
1264   ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
1265   for (i=0; i<merge->nrecv; i++) {
1266     PetscMPIInt icompleted;
1267     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1268   }
1269   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1270   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1271 
1272   /* send and recv coi */
1273   /*-------------------*/
1274   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1275   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1276   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
1277   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1278   for (proc=0,k=0; proc<size; proc++) {
1279     if (!len_s[proc]) continue;
1280     /* form outgoing message for i-structure:
1281          buf_si[0]:                 nrows to be sent
1282                [1:nrows]:           row index (global)
1283                [nrows+1:2*nrows+1]: i-structure index
1284     */
1285     /*-------------------------------------------*/
1286     nrows       = len_si[proc]/2 - 1;
1287     buf_si_i    = buf_si + nrows+1;
1288     buf_si[0]   = nrows;
1289     buf_si_i[0] = 0;
1290     nrows       = 0;
1291     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1292       nzi               = coi[i+1] - coi[i];
1293       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1294       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1295       nrows++;
1296     }
1297     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1298     k++;
1299     buf_si += len_si[proc];
1300   }
1301   i = merge->nrecv;
1302   while (i--) {
1303     PetscMPIInt icompleted;
1304     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1305   }
1306   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1307   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1308   ierr = PetscFree(len_si);CHKERRQ(ierr);
1309   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1310   ierr = PetscFree(swaits);CHKERRQ(ierr);
1311   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1312   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1313 
1314   /* compute the local portion of C (mpi mat) */
1315   /*------------------------------------------*/
1316   /* allocate bi array and free space for accumulating nonzero column info */
1317   ierr  = PetscMalloc1(pn+1,&bi);CHKERRQ(ierr);
1318   bi[0] = 0;
1319 
1320   /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1321   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pdti[pn],PetscIntSumTruncate(poti[pon],ai[am])));
1322   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1323   current_space = free_space;
1324 
1325   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1326   for (k=0; k<merge->nrecv; k++) {
1327     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1328     nrows       = *buf_ri_k[k];
1329     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
1330     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1331   }
1332 
1333   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1334   rmax = 0;
1335   for (i=0; i<pn; i++) {
1336     /* add pdt[i,:]*AP into lnk */
1337     pnz = pdti[i+1] - pdti[i];
1338     ptJ = pdtj + pdti[i];
1339     for (j=0; j<pnz; j++) {
1340       row  = ptJ[j];  /* row of AP == col of Pt */
1341       anz  = ai[row+1] - ai[row];
1342       Jptr = aj + ai[row];
1343       /* add non-zero cols of AP into the sorted linked list lnk */
1344       ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1345     }
1346 
1347     /* add received col data into lnk */
1348     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1349       if (i == *nextrow[k]) { /* i-th row */
1350         nzi  = *(nextci[k]+1) - *nextci[k];
1351         Jptr = buf_rj[k] + *nextci[k];
1352         ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1353         nextrow[k]++; nextci[k]++;
1354       }
1355     }
1356     nnz = lnk[0];
1357 
1358     /* if free space is not available, make more free space */
1359     if (current_space->local_remaining<nnz) {
1360       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1361       nspacedouble++;
1362     }
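    /* each expansion requests nnz plus the current total size, roughly doubling capacity;
       nspacedouble counts these reallocations and is reported in the PetscInfo summary below */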
1363     /* copy data into free space, then initialize lnk */
1364     ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1365     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1366 
1367     current_space->array           += nnz;
1368     current_space->local_used      += nnz;
1369     current_space->local_remaining -= nnz;
1370 
1371     bi[i+1] = bi[i] + nnz;
1372     if (nnz > rmax) rmax = nnz;
1373   }
1374   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1375 
1376   ierr = PetscMalloc1(bi[pn]+1,&bj);CHKERRQ(ierr);
1377   ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1378 
1379   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1380   if (afill_tmp > afill) afill = afill_tmp;
1381   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
1382   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1383   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1384 
1385   /* create symbolic parallel matrix Cmpi - why can it not be assembled in the Numeric part? */
1386   /*----------------------------------------------------------------------------------*/
1387   ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);
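  /* rmax is the longest row of C on this process, so a single zero-filled buffer of length rmax
     suffices to insert every row's nonzero pattern below; the numeric phase fills in the values */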
1388 
1389   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1390   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1391   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
1392   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1393   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1394   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1395   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1396   for (i=0; i<pn; i++) {
1397     row  = i + rstart;
1398     nnz  = bi[i+1] - bi[i];
1399     Jptr = bj + bi[i];
1400     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1401   }
1402   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1403   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1404   ierr = PetscFree(vals);CHKERRQ(ierr);
1405 
1406   merge->bi        = bi;
1407   merge->bj        = bj;
1408   merge->coi       = coi;
1409   merge->coj       = coj;
1410   merge->buf_ri    = buf_ri;
1411   merge->buf_rj    = buf_rj;
1412   merge->owners_co = owners_co;
1413 
1414   /* attach the supporting struct to Cmpi for reuse */
1415   c           = (Mat_MPIAIJ*)Cmpi->data;
1416   c->ptap     = ptap;
1417   ptap->api   = NULL;
1418   ptap->apj   = NULL;
1419   ptap->merge = merge;
1420   ptap->destroy   = Cmpi->ops->destroy;
1421   ptap->duplicate = Cmpi->ops->duplicate;
1422 
1423   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
1424   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1425   Cmpi->ops->duplicate               = MatDuplicate_MPIAIJ_MatPtAP;
1426 
1427   *C = Cmpi;
1428 #if defined(PETSC_USE_INFO)
1429   if (bi[pn] != 0) {
1430     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
1431     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
1432   } else {
1433     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1434   }
1435 #endif
1436   PetscFunctionReturn(0);
1437 }
1438 
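/* Numeric phase of C = P^T*A for MPIAIJ P and A, reusing the merge data built by the symbolic
   phase: local outer products accumulate the off-process contribution Co = (p->B)^T*A_loc (sent
   to the owning ranks) and the local contribution (p->A)^T*A_loc (kept in ba); the two are then
   merged row by row and inserted into the assembled parallel matrix C. */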
1439 #undef __FUNCT__
1440 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ"
1441 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
1442 {
1443   PetscErrorCode      ierr;
1444   Mat_Merge_SeqsToMPI *merge;
1445   Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1446   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1447   Mat_PtAPMPI         *ptap;
1448   PetscInt            *adj;
1449   PetscInt            i,j,k,anz,pnz,row,*cj,nexta;
1450   MatScalar           *ada,*ca,valtmp;
1451   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1452   MPI_Comm            comm;
1453   PetscMPIInt         size,rank,taga,*len_s;
1454   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1455   PetscInt            **buf_ri,**buf_rj;
1456   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
1457   MPI_Request         *s_waits,*r_waits;
1458   MPI_Status          *status;
1459   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
1460   PetscInt            *ai,*aj,*coi,*coj;
1461   PetscInt            *poJ,*pdJ;
1462   Mat                 A_loc;
1463   Mat_SeqAIJ          *a_loc;
1464 
1465   PetscFunctionBegin;
1466   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1467   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1468   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1469 
1470   ptap  = c->ptap;
1471   merge = ptap->merge;
1472 
1473   /* 2) compute numeric C_seq = P_loc^T*A_loc */
1474   /*------------------------------------------*/
1475   /* get data from symbolic products */
1476   coi    = merge->coi; coj = merge->coj;
1477   ierr   = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);
1478   bi     = merge->bi; bj = merge->bj;
1479   owners = merge->rowmap->range;
1480   ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);
1481 
1482   /* get A_loc by taking all local rows of A */
1483   A_loc = ptap->A_loc;
1484   ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
1485   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1486   ai    = a_loc->i;
1487   aj    = a_loc->j;
1488 
1489   for (i=0; i<am; i++) {
1490     anz = ai[i+1] - ai[i];
1491     adj = aj + ai[i];
1492     ada = a_loc->a + ai[i];
1493 
1494     /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1495     /*-------------------------------------------------------------*/
1496     /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1497     pnz = po->i[i+1] - po->i[i];
1498     poJ = po->j + po->i[i];
1499     pA  = po->a + po->i[i];
1500     for (j=0; j<pnz; j++) {
1501       row = poJ[j];
1502       cj  = coj + coi[row];
1503       ca  = coa + coi[row];
1504       /* perform sparse axpy */
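      /* the symbolic phase built cj as a superset of adj (the pattern of A_loc[i,:]), so a single
         merged pass over cj suffices to accumulate all anz products of this outer-product term */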
1505       nexta  = 0;
1506       valtmp = pA[j];
1507       for (k=0; nexta<anz; k++) {
1508         if (cj[k] == adj[nexta]) {
1509           ca[k] += valtmp*ada[nexta];
1510           nexta++;
1511         }
1512       }
1513       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1514     }
1515 
1516     /* put the value into Cd (diagonal part) */
1517     pnz = pd->i[i+1] - pd->i[i];
1518     pdJ = pd->j + pd->i[i];
1519     pA  = pd->a + pd->i[i];
1520     for (j=0; j<pnz; j++) {
1521       row = pdJ[j];
1522       cj  = bj + bi[row];
1523       ca  = ba + bi[row];
1524       /* perform sparse axpy */
1525       nexta  = 0;
1526       valtmp = pA[j];
1527       for (k=0; nexta<anz; k++) {
1528         if (cj[k] == adj[nexta]) {
1529           ca[k] += valtmp*ada[nexta];
1530           nexta++;
1531         }
1532       }
1533       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1534     }
1535   }
1536 
1537   /* 3) send and recv matrix values coa */
1538   /*------------------------------------*/
1539   buf_ri = merge->buf_ri;
1540   buf_rj = merge->buf_rj;
1541   len_s  = merge->len_s;
1542   ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
1543   ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1544 
1545   ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
1546   for (proc=0,k=0; proc<size; proc++) {
1547     if (!len_s[proc]) continue;
1548     i    = merge->owners_co[proc];
1549     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1550     k++;
1551   }
1552   if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
1553   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
1554 
1555   ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
1556   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1557   ierr = PetscFree(coa);CHKERRQ(ierr);
1558 
1559   /* 4) insert local Cseq and received values into Cmpi */
1560   /*----------------------------------------------------*/
1561   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1562   for (k=0; k<merge->nrecv; k++) {
1563     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1564     nrows       = *(buf_ri_k[k]);
1565     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th recved i-structure */
1566     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
1567   }
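  /* for each local row, fold the received contributions (whose column patterns are subsets of
     bj_i) into ba_i, then insert the completed row into C with MatSetValues() */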
1568 
1569   for (i=0; i<cm; i++) {
1570     row  = owners[rank] + i; /* global row index of C_seq */
1571     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1572     ba_i = ba + bi[i];
1573     bnz  = bi[i+1] - bi[i];
1574     /* add received vals into ba */
1575     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1576       /* i-th row */
1577       if (i == *nextrow[k]) {
1578         cnz    = *(nextci[k]+1) - *nextci[k];
1579         cj     = buf_rj[k] + *(nextci[k]);
1580         ca     = abuf_r[k] + *(nextci[k]);
1581         nextcj = 0;
1582         for (j=0; nextcj<cnz; j++) {
1583           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1584             ba_i[j] += ca[nextcj++];
1585           }
1586         }
1587         nextrow[k]++; nextci[k]++;
1588         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1589       }
1590     }
1591     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
1592   }
1593   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1594   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1595 
1596   ierr = PetscFree(ba);CHKERRQ(ierr);
1597   ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
1598   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1599   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1600   PetscFunctionReturn(0);
1601 }
1602 
1603 PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1604 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ();
1605    it differs from MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable() in using PetscLLCondensedCreate_Scalable() */
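/* Outline: (1) gather the local rows of A into A_loc and form the symbolic transposes PDt and POt
   of P's diagonal and off-diagonal blocks; (2) build the symbolic off-process contribution
   Co = (p->B)^T*A and communicate its row structure to the owning ranks; (3) merge the local
   (p->A)^T*A pattern with the received structure to obtain bi/bj of C; (4) create and preallocate
   the parallel matrix Cmpi and attach the merge data for reuse in the numeric phase. */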
1606 #undef __FUNCT__
1607 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ"
1608 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
1609 {
1610   PetscErrorCode      ierr;
1611   Mat                 Cmpi,A_loc,POt,PDt;
1612   Mat_PtAPMPI         *ptap;
1613   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1614   Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
1615   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1616   PetscInt            nnz;
1617   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1618   PetscInt            am  =A->rmap->n,pn=P->cmap->n;
1619   MPI_Comm            comm;
1620   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1621   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1622   PetscInt            len,proc,*dnz,*onz,*owners;
1623   PetscInt            nzi,*bi,*bj;
1624   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1625   MPI_Request         *swaits,*rwaits;
1626   MPI_Status          *sstatus,rstatus;
1627   Mat_Merge_SeqsToMPI *merge;
1628   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1629   PetscReal           afill  =1.0,afill_tmp;
1630   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Armax;
1631   PetscScalar         *vals;
1632   Mat_SeqAIJ          *a_loc,*pdt,*pot;
1633   PetscTable          ta;
1634 
1635   PetscFunctionBegin;
1636   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1637   /* check if matrix local sizes are compatible */
1638   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1639 
1640   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1641   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1642 
1643   /* create struct Mat_PtAPMPI and attach it to C later */
1644   ierr = PetscNew(&ptap);CHKERRQ(ierr);
1645 
1646   /* get A_loc by taking all local rows of A */
1647   ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
1648 
1649   ptap->A_loc = A_loc;
1650   a_loc       = (Mat_SeqAIJ*)(A_loc)->data;
1651   ai          = a_loc->i;
1652   aj          = a_loc->j;
1653 
1654   /* determine symbolic Co=(p->B)^T*A - send to others */
1655   /*----------------------------------------------------*/
1656   ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
1657   pdt  = (Mat_SeqAIJ*)PDt->data;
1658   pdti = pdt->i; pdtj = pdt->j;
1659 
1660   ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
1661   pot  = (Mat_SeqAIJ*)POt->data;
1662   poti = pot->i; potj = pot->j;
1663 
1664   /* then, compute symbolic Co = (p->B)^T*A */
1665   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1666                          >= (num of nonzero rows of C_seq) - pn */
1667   ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
1668   coi[0] = 0;
1669 
1670   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1671   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],ai[am]));
1672   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1673   current_space = free_space;
1674 
1675   /* create and initialize a linked list */
1676   ierr = PetscTableCreate(aN,aN,&ta);CHKERRQ(ierr);
1677   MatRowMergeMax_SeqAIJ(a_loc,am,ta);
1678   ierr = PetscTableGetCount(ta,&Armax);CHKERRQ(ierr);
1679   ierr = PetscLLCondensedCreate_Scalable(Armax,&lnk);CHKERRQ(ierr);
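  /* Armax is the number of distinct column indices occurring in A_loc, an upper bound on the
     length of any merged row, so the condensed linked list is sized by Armax rather than by the
     global column count aN used in the nonscalable variant */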
1680 
1681   for (i=0; i<pon; i++) {
1682     pnz = poti[i+1] - poti[i];
1683     ptJ = potj + poti[i];
1684     for (j=0; j<pnz; j++) {
1685       row  = ptJ[j]; /* row of A_loc == col of Pot */
1686       anz  = ai[row+1] - ai[row];
1687       Jptr = aj + ai[row];
1688       /* add non-zero cols of AP into the sorted linked list lnk */
1689       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1690     }
1691     nnz = lnk[0];
1692 
1693     /* If free space is not available, double the total space in the list */
1694     if (current_space->local_remaining<nnz) {
1695       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1696       nspacedouble++;
1697     }
1698 
1699     /* Copy data into free space, then initialize lnk */
1700     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1701 
1702     current_space->array           += nnz;
1703     current_space->local_used      += nnz;
1704     current_space->local_remaining -= nnz;
1705 
1706     coi[i+1] = coi[i] + nnz;
1707   }
1708 
1709   ierr = PetscMalloc1(coi[pon]+1,&coj);CHKERRQ(ierr);
1710   ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
1711   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); /* must destroy to get a new one for C */
1712 
1713   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1714   if (afill_tmp > afill) afill = afill_tmp;
1715 
1716   /* send j-array (coj) of Co to other processors */
1717   /*----------------------------------------------*/
1718   /* determine row ownership */
1719   ierr = PetscNew(&merge);CHKERRQ(ierr);
1720   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1721 
1722   merge->rowmap->n  = pn;
1723   merge->rowmap->bs = 1;
1724 
1725   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1726   owners = merge->rowmap->range;
1727 
1728   /* determine the number of messages to send, their lengths */
1729   ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
1730   ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);
1731 
1732   len_s        = merge->len_s;
1733   merge->nsend = 0;
1734 
1735   ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
1736   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1737 
1738   proc = 0;
1739   for (i=0; i<pon; i++) {
1740     while (prmap[i] >= owners[proc+1]) proc++;
1741     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1742     len_s[proc] += coi[i+1] - coi[i];
1743   }
1744 
1745   len          = 0; /* total length of buf_s[] */
1746   owners_co[0] = 0;
1747   for (proc=0; proc<size; proc++) {
1748     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1749     if (len_si[proc]) {
1750       merge->nsend++;
1751       len_si[proc] = 2*(len_si[proc] + 1);
1752       len         += len_si[proc];
1753     }
1754   }
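  /* each i-structure message holds a row count, nrows row indices and nrows+1 offsets, hence
     2*(nrows+1) integers; len accumulates the total size needed for buf_s */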
1755 
1756   /* determine the number and length of messages to receive for coi and coj  */
1757   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1758   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1759 
1760   /* post the Irecv and Isend of coj */
1761   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1762   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1763   ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
1764   for (proc=0, k=0; proc<size; proc++) {
1765     if (!len_s[proc]) continue;
1766     i    = owners_co[proc];
1767     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1768     k++;
1769   }
1770 
1771   /* receives and sends of coj are complete */
1772   ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
1773   for (i=0; i<merge->nrecv; i++) {
1774     PetscMPIInt icompleted;
1775     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1776   }
1777   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1778   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1779 
1780   /* add received column indices into table to update Armax */
1781   for (k=0; k<merge->nrecv; k++) {/* k-th received message */
1782     Jptr = buf_rj[k];
1783     for (j=0; j<merge->len_r[k]; j++) {
1784       ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
1785     }
1786   }
1787   ierr = PetscTableGetCount(ta,&Armax);CHKERRQ(ierr);
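  /* rows received from other processes may reference columns that do not occur in A_loc, so the
     bound Armax is refreshed before the linked list for the rows of C is created below */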
1788 
1789   /* send and recv coi */
1790   /*-------------------*/
1791   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1792   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1793   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
1794   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1795   for (proc=0,k=0; proc<size; proc++) {
1796     if (!len_s[proc]) continue;
1797     /* form outgoing message for i-structure:
1798          buf_si[0]:                 nrows to be sent
1799                [1:nrows]:           row index (global)
1800                [nrows+1:2*nrows+1]: i-structure index
1801     */
1802     /*-------------------------------------------*/
1803     nrows       = len_si[proc]/2 - 1;
1804     buf_si_i    = buf_si + nrows+1;
1805     buf_si[0]   = nrows;
1806     buf_si_i[0] = 0;
1807     nrows       = 0;
1808     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1809       nzi               = coi[i+1] - coi[i];
1810       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
1811       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1812       nrows++;
1813     }
1814     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1815     k++;
1816     buf_si += len_si[proc];
1817   }
1818   i = merge->nrecv;
1819   while (i--) {
1820     PetscMPIInt icompleted;
1821     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1822   }
1823   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1824   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1825   ierr = PetscFree(len_si);CHKERRQ(ierr);
1826   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1827   ierr = PetscFree(swaits);CHKERRQ(ierr);
1828   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1829   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1830 
1831   /* compute the local portion of C (mpi mat) */
1832   /*------------------------------------------*/
1833   /* allocate bi array and free space for accumulating nonzero column info */
1834   ierr  = PetscMalloc1(pn+1,&bi);CHKERRQ(ierr);
1835   bi[0] = 0;
1836 
1837   /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1838   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pdti[pn],PetscIntSumTruncate(poti[pon],ai[am])));
1839   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1840   current_space = free_space;
1841 
1842   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1843   for (k=0; k<merge->nrecv; k++) {
1844     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1845     nrows       = *buf_ri_k[k];
1846     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
1847     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure  */
1848   }
1849 
1850   ierr = PetscLLCondensedCreate_Scalable(Armax,&lnk);CHKERRQ(ierr);
1851   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1852   rmax = 0;
1853   for (i=0; i<pn; i++) {
1854     /* add pdt[i,:]*AP into lnk */
1855     pnz = pdti[i+1] - pdti[i];
1856     ptJ = pdtj + pdti[i];
1857     for (j=0; j<pnz; j++) {
1858       row  = ptJ[j];  /* row of AP == col of Pt */
1859       anz  = ai[row+1] - ai[row];
1860       Jptr = aj + ai[row];
1861       /* add non-zero cols of AP into the sorted linked list lnk */
1862       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1863     }
1864 
1865     /* add received col data into lnk */
1866     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1867       if (i == *nextrow[k]) { /* i-th row */
1868         nzi  = *(nextci[k]+1) - *nextci[k];
1869         Jptr = buf_rj[k] + *nextci[k];
1870         ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
1871         nextrow[k]++; nextci[k]++;
1872       }
1873     }
1874     nnz = lnk[0];
1875 
1876     /* if free space is not available, make more free space */
1877     if (current_space->local_remaining<nnz) {
1878       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1879       nspacedouble++;
1880     }
1881     /* copy data into free space, then initialize lnk */
1882     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1883     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1884 
1885     current_space->array           += nnz;
1886     current_space->local_used      += nnz;
1887     current_space->local_remaining -= nnz;
1888 
1889     bi[i+1] = bi[i] + nnz;
1890     if (nnz > rmax) rmax = nnz;
1891   }
1892   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1893 
1894   ierr      = PetscMalloc1(bi[pn]+1,&bj);CHKERRQ(ierr);
1895   ierr      = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1896   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1897   if (afill_tmp > afill) afill = afill_tmp;
1898   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
1899   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
1900 
1901   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1902   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1903 
1904   /* create symbolic parallel matrix Cmpi - why can it not be assembled in the Numeric part? */
1905   /*----------------------------------------------------------------------------------*/
1906   ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);
1907 
1908   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1909   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1910   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
1911   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1912   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1913   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1914   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1915   for (i=0; i<pn; i++) {
1916     row  = i + rstart;
1917     nnz  = bi[i+1] - bi[i];
1918     Jptr = bj + bi[i];
1919     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1920   }
1921   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1922   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1923   ierr = PetscFree(vals);CHKERRQ(ierr);
1924 
1925   merge->bi        = bi;
1926   merge->bj        = bj;
1927   merge->coi       = coi;
1928   merge->coj       = coj;
1929   merge->buf_ri    = buf_ri;
1930   merge->buf_rj    = buf_rj;
1931   merge->owners_co = owners_co;
1932 
1933   /* attach the supporting struct to Cmpi for reuse */
1934   c = (Mat_MPIAIJ*)Cmpi->data;
1935 
1936   c->ptap     = ptap;
1937   ptap->api   = NULL;
1938   ptap->apj   = NULL;
1939   ptap->merge = merge;
1940   ptap->apa   = NULL;
1941   ptap->destroy   = Cmpi->ops->destroy;
1942   ptap->duplicate = Cmpi->ops->duplicate;
1943 
1944   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
1945   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1946   Cmpi->ops->duplicate               = MatDuplicate_MPIAIJ_MatPtAP;
1947 
1948   *C = Cmpi;
1949 #if defined(PETSC_USE_INFO)
1950   if (bi[pn] != 0) {
1951     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
1952     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
1953   } else {
1954     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1955   }
1956 #endif
1957   PetscFunctionReturn(0);
1958 }
1959
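/* A minimal usage sketch (illustrative only, not part of this file; the function name and the
   driver code are assumptions): the symbolic and numeric kernels above back MatTransposeMatMult()
   for MPIAIJ operands, with MAT_INITIAL_MATRIX triggering the symbolic+numeric pair and
   MAT_REUSE_MATRIX reusing the attached ptap/merge data for numeric-only updates. */
#if 0
static PetscErrorCode ExampleTransposeMatMult(Mat P,Mat A,PetscReal fill)
{
  PetscErrorCode ierr;
  Mat            C;

  PetscFunctionBegin;
  ierr = MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,fill,&C);CHKERRQ(ierr); /* symbolic + numeric */
  /* ... update entries of A or P without changing their nonzero patterns ... */
  ierr = MatTransposeMatMult(P,A,MAT_REUSE_MATRIX,fill,&C);CHKERRQ(ierr);   /* numeric only */
  ierr = MatDestroy(&C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif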