/* xref: /petsc/src/mat/impls/aij/mpi/mpimatmatmult.c (revision fee518eb64eefbf9bda05b76e476d2dcf413f412) */

/*
  Defines matrix-matrix product routines for pairs of MPIAIJ matrices
          C = A * B
*/
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/utils/freespace.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <../src/mat/impls/dense/mpi/mpidense.h>
#include <petsc-private/vecimpl.h>

#undef __FUNCT__
#define __FUNCT__ "MatMatMult_MPIAIJ_MPIAIJ"
PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);CHKERRQ(ierr);
  }

  ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

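/*
   Usage sketch (assumptions: A and B are assembled MPIAIJ matrices with
   compatible layouts; error checking abbreviated). The symbolic phase runs
   only on the first call; later calls with the same nonzero pattern reuse
   the structure attached to C:

     Mat C;
     ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);
     ierr = MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
     ierr = MatDestroy(&C);CHKERRQ(ierr);

   The second call dispatches through (*C)->ops->matmultnumeric, i.e. the
   numeric routine installed by the symbolic phase.
*/
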
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJ_MatMatMult"
PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI    *ptap = a->ptap;

  PetscFunctionBegin;
  ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
  ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);
  ierr = PetscFree(ptap->api);CHKERRQ(ierr);
  ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
  ierr = PetscFree(ptap->apa);CHKERRQ(ierr);
  ierr = ptap->destroy(A);CHKERRQ(ierr);
  ierr = PetscFree(ptap);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDuplicate_MPIAIJ_MatMatMult"
PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI    *ptap = a->ptap;

  PetscFunctionBegin;
  ierr = (*ptap->duplicate)(A,op,M);CHKERRQ(ierr);

  (*M)->ops->destroy   = ptap->destroy;   /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
  (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ"
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a  =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ     *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
  PetscInt       *adi=ad->i,*adj,*aoi=ao->i,*aoj;
  PetscScalar    *ada,*aoa,*cda=cd->a,*coa=co->a;
  Mat_SeqAIJ     *p_loc,*p_oth;
  PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
  PetscScalar    *pa_loc,*pa_oth,*pa,*apa,valtmp,*ca;
  PetscInt       cm   =C->rmap->n,anz,pnz;
  Mat_PtAPMPI    *ptap=c->ptap;
  PetscInt       *api,*apj,*apJ,i,j,k,row;
  PetscInt       cstart=C->cmap->rstart;
  PetscInt       cdnz,conz,k0,k1;

  PetscFunctionBegin;
  /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  /* update numerical values of P_oth and P_loc */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
  /*----------------------------------------------------------*/
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
  pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;

  /* get apa for storing dense row A[i,:]*P */
  apa = ptap->apa;

  api = ptap->api;
  apj = ptap->apj;
  for (i=0; i<cm; i++) {
    /* diagonal portion of A */
    anz = adi[i+1] - adi[i];
    adj = ad->j + adi[i];
    ada = ad->a + adi[i];
    for (j=0; j<anz; j++) {
      row = adj[j];
      pnz = pi_loc[row+1] - pi_loc[row];
      pj  = pj_loc + pi_loc[row];
      pa  = pa_loc + pi_loc[row];

      /* perform dense axpy */
      valtmp = ada[j];
      for (k=0; k<pnz; k++) {
        apa[pj[k]] += valtmp*pa[k];
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* off-diagonal portion of A */
    anz = aoi[i+1] - aoi[i];
    aoj = ao->j + aoi[i];
    aoa = ao->a + aoi[i];
    for (j=0; j<anz; j++) {
      row = aoj[j];
      pnz = pi_oth[row+1] - pi_oth[row];
      pj  = pj_oth + pi_oth[row];
      pa  = pa_oth + pi_oth[row];

      /* perform dense axpy */
      valtmp = aoa[j];
      for (k=0; k<pnz; k++) {
        apa[pj[k]] += valtmp*pa[k];
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* set values in C */
    apJ  = apj + api[i];
    cdnz = cd->i[i+1] - cd->i[i];
    conz = co->i[i+1] - co->i[i];

    /* 1st off-diagonal part of C */
    ca = coa + co->i[i];
    k  = 0;
    for (k0=0; k0<conz; k0++) {
      if (apJ[k] >= cstart) break;
      ca[k0]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }

    /* diagonal part of C */
    ca = cda + cd->i[i];
    for (k1=0; k1<cdnz; k1++) {
      ca[k1]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }

    /* 2nd off-diagonal part of C */
    ca = coa + co->i[i];
    for (; k0<conz; k0++) {
      ca[k0]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

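/*
   The loop above computes one row of C = A*P with a dense accumulator: for
   each nonzero A(i,row) the sparse row P(row,:) is scaled and added into the
   dense work array apa of length pN, after which the result is gathered into
   C's diagonal and off-diagonal blocks and apa is re-zeroed. A minimal
   standalone sketch of the same kernel in plain C, using hypothetical CSR
   arrays ai/aj/aa for A and pi/pj/pa for P (not the PETSc data structures);
   acc must be zeroed and of length ncols(P):

     void spgemm_row_dense(int i,const int *ai,const int *aj,const double *aa,
                           const int *pi,const int *pj,const double *pa,
                           double *acc)
     {
       int j,k;
       for (j=ai[i]; j<ai[i+1]; j++) {
         double v = aa[j];
         for (k=pi[aj[j]]; k<pi[aj[j]+1]; k++) acc[pj[k]] += v*pa[k];
       }
     }
*/
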
#undef __FUNCT__
#define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ"
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode     ierr;
  MPI_Comm           comm=((PetscObject)A)->comm;
  Mat                Cmpi;
  Mat_PtAPMPI        *ptap;
  PetscFreeSpaceList free_space=NULL,current_space=NULL;
  Mat_MPIAIJ         *a        =(Mat_MPIAIJ*)A->data,*c;
  Mat_SeqAIJ         *ad       =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
  PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
  PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
  PetscInt           *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
  PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
  PetscBT            lnkbt;
  PetscScalar        *apa;
  PetscReal          afill;
  PetscBool          scalable=PETSC_TRUE;
  PetscInt           nlnk_max,armax,prmax;

  PetscFunctionBegin;
  if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) {
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
  }

  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-matmatmult_scalable","Use a scalable but slower C=A*B","",scalable,&scalable,NULL);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  if (scalable) {
    ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(A,P,fill,C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr);

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j;
  pi_oth = p_oth->i; pj_oth = p_oth->j;

  /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
  /*-------------------------------------------------------------------*/
  ierr      = PetscMalloc((am+2)*sizeof(PetscInt),&api);CHKERRQ(ierr);
  ptap->api = api;
  api[0]    = 0;

  /* create and initialize a linked list */
  armax    = ad->rmax+ao->rmax;
  prmax    = PetscMax(p_loc->rmax,p_oth->rmax);
  nlnk_max = armax*prmax;
  if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
  ierr = PetscLLCondensedCreate(nlnk_max,pN,&lnk,&lnkbt);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
  ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);

  current_space = free_space;

  ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    /* diagonal portion of A */
    nzi = adi[i+1] - adi[i];
    for (j=0; j<nzi; j++) {
      row  = *adj++;
      pnz  = pi_loc[row+1] - pi_loc[row];
      Jptr = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    for (j=0; j<nzi; j++) {
      row  = *aoj++;
      pnz  = pi_oth[row+1] - pi_oth[row];
      Jptr = pj_oth + pi_oth[row];
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }

    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }

  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc((api[am]+1)*sizeof(PetscInt),&ptap->apj);CHKERRQ(ierr);
  apj  = ptap->apj;
  ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* malloc apa to store dense row A[i,:]*P */
  ierr = PetscMalloc(pN*sizeof(PetscScalar),&apa);CHKERRQ(ierr);
  ierr = PetscMemzero(apa,pN*sizeof(PetscScalar));CHKERRQ(ierr);

  ptap->apa = apa;

  /* create and assemble symbolic parallel matrix Cmpi */
  /*----------------------------------------------------*/
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,A->rmap->bs,P->cmap->bs);CHKERRQ(ierr);

  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    row  = i + rstart;
    apnz = api[i+1] - api[i];
    ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
    apj += apnz;
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->destroy        = Cmpi->ops->destroy;
  ptap->duplicate      = Cmpi->ops->duplicate;
  Cmpi->ops->destroy   = MatDestroy_MPIAIJ_MatMatMult;
  Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;

  /* attach the supporting struct to Cmpi for reuse */
  c       = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap = ptap;

  *C = Cmpi;

  /* set MatInfo */
  afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
  if (afill < 1.0) afill = 1.0;
  Cmpi->info.mallocs           = nspacedouble;
  Cmpi->info.fill_ratio_given  = fill;
  Cmpi->info.fill_ratio_needed = afill;

#if defined(PETSC_USE_INFO)
  if (api[am]) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}

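/*
   Algorithm selection: in this revision scalable defaults to PETSC_TRUE, so
   MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable() (defined later in this file)
   is used unless the option is turned off. A sketch of both ways to select
   the non-scalable dense-accumulator variant (option handling abbreviated):

     mpiexec -n 8 ./app -matmatmult_scalable false

   or, programmatically before the first product:

     ierr = PetscOptionsSetValue("-matmatmult_scalable","false");CHKERRQ(ierr);
     ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,fill,&C);CHKERRQ(ierr);
*/
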
#undef __FUNCT__
#define __FUNCT__ "MatMatMult_MPIAIJ_MPIDense"
PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);CHKERRQ(ierr);
  }
  ierr = MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

typedef struct {
  Mat         workB;
  PetscScalar *rvalues,*svalues;
  MPI_Request *rwaits,*swaits;
} MPIAIJ_MPIDense;

#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJ_MPIDenseDestroy"
PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
{
  MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatDestroy(&contents->workB);CHKERRQ(ierr);
  ierr = PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);CHKERRQ(ierr);
  ierr = PetscFree(contents);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIDense"
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
{
  PetscErrorCode         ierr;
  Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
  PetscInt               nz   = aij->B->cmap->n;
  PetscContainer         container;
  MPIAIJ_MPIDense        *contents;
  VecScatter             ctx   = aij->Mvctx;
  VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
  VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
  PetscInt               m     = A->rmap->n,n=B->cmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(((PetscObject)B)->comm,C);CHKERRQ(ierr);
  ierr = MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(*C,A->rmap->bs,B->cmap->bs);CHKERRQ(ierr);
  ierr = MatSetType(*C,MATMPIDENSE);CHKERRQ(ierr);
  ierr = MatMPIDenseSetPreallocation(*C,NULL);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;

  ierr = PetscNew(MPIAIJ_MPIDense,&contents);CHKERRQ(ierr);
  /* Create work matrix used to store off-processor rows of B needed for local product */
  ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
  /* Create work arrays needed */
  ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],PetscScalar,&contents->rvalues,
                      B->cmap->N*to->starts[to->n],PetscScalar,&contents->svalues,
                      from->n,MPI_Request,&contents->rwaits,
                      to->n,MPI_Request,&contents->swaits);CHKERRQ(ierr);

  ierr = PetscContainerCreate(((PetscObject)A)->comm,&container);CHKERRQ(ierr);
  ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
  ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
  ierr = PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);CHKERRQ(ierr);
  ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

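/*
   Usage sketch for the AIJ*Dense product (assumptions: A is MPIAIJ, B is
   MPIDENSE with a row layout matching A's column layout; error checking
   abbreviated). The result C is MPIDENSE, and a MAT_REUSE_MATRIX call
   reuses the workB matrix and message buffers stored in the "workB"
   container composed above:

     Mat C;
     ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
     ierr = MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
*/
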
#undef __FUNCT__
#define __FUNCT__ "MatMPIDenseScatter"
/*
    Performs an efficient scatter on the rows of B needed by this process; this is
    a modification of the VecScatterBegin_() routines.
*/
PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
{
  Mat_MPIAIJ             *aij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode         ierr;
  PetscScalar            *b,*w,*svalues,*rvalues;
  VecScatter             ctx   = aij->Mvctx;
  VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
  VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
  PetscInt               i,j,k;
  PetscInt               *sindices,*sstarts,*rindices,*rstarts;
  PetscMPIInt            *sprocs,*rprocs,nrecvs;
  MPI_Request            *swaits,*rwaits;
  MPI_Comm               comm = ((PetscObject)A)->comm;
  PetscMPIInt            tag  = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
  MPI_Status             status;
  MPIAIJ_MPIDense        *contents;
  PetscContainer         container;
  Mat                    workB;

  PetscFunctionBegin;
  ierr = PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);CHKERRQ(ierr);
  if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
  ierr = PetscContainerGetPointer(container,(void**)&contents);CHKERRQ(ierr);

  workB = *outworkB = contents->workB;
  if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of columns of aij->B %D not equal to rows of workB %D",nrows,workB->rmap->n);
  sindices = to->indices;
  sstarts  = to->starts;
  sprocs   = to->procs;
  swaits   = contents->swaits;
  svalues  = contents->svalues;

  rindices = from->indices;
  rstarts  = from->starts;
  rprocs   = from->procs;
  rwaits   = contents->rwaits;
  rvalues  = contents->rvalues;

  ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr);
  ierr = MatDenseGetArray(workB,&w);CHKERRQ(ierr);

  for (i=0; i<from->n; i++) {
    ierr = MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);CHKERRQ(ierr);
  }

  for (i=0; i<to->n; i++) {
    /* pack a message at a time */
    CHKMEMQ;
    for (j=0; j<sstarts[i+1]-sstarts[i]; j++) {
      for (k=0; k<ncols; k++) {
        svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
      }
    }
    CHKMEMQ;
    ierr = MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);CHKERRQ(ierr);
  }

  nrecvs = from->n;
  while (nrecvs) {
    ierr = MPI_Waitany(from->n,rwaits,&imdex,&status);CHKERRQ(ierr);
    nrecvs--;
    /* unpack a message at a time */
    CHKMEMQ;
    for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++) {
      for (k=0; k<ncols; k++) {
        w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
      }
    }
    CHKMEMQ;
  }
  if (to->n) {ierr = MPI_Waitall(to->n,swaits,to->sstatus);CHKERRQ(ierr);}

  ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr);
  ierr = MatDenseRestoreArray(workB,&w);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);

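/*
   Packing note for the scatter above: MPIDense stores its local array in
   column-major (LAPACK) order, so entry (r,c) of the local block of B lives
   at b[r + nrowsB*c]. For example, with nrowsB = 4, row 2 of a 3-column
   block is gathered from b[2], b[6], b[10], which is exactly the stride in
   the svalues packing loop; the unpacking loop writes w[r + nrows*c] with
   the same convention for workB.
*/
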
#undef __FUNCT__
#define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIDense"
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij    = (Mat_MPIAIJ*)A->data;
  Mat_MPIDense   *bdense = (Mat_MPIDense*)B->data;
  Mat_MPIDense   *cdense = (Mat_MPIDense*)C->data;
  Mat            workB;

  PetscFunctionBegin;
  /* diagonal block of A times all local rows of B */
  ierr = MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);CHKERRQ(ierr);

  /* get off-processor parts of B needed to complete the product */
  ierr = MatMPIDenseScatter(A,B,C,&workB);CHKERRQ(ierr);

  /* off-diagonal block of A times nonlocal rows of B */
  ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable"
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a   = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *ad  = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ     *cd  = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
  PetscInt       *adi = ad->i,*adj,*aoi=ao->i,*aoj;
  PetscScalar    *ada,*aoa,*cda=cd->a,*coa=co->a;
  Mat_SeqAIJ     *p_loc,*p_oth;
  PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
  PetscScalar    *pa_loc,*pa_oth,*pa,valtmp,*ca;
  PetscInt       cm          = C->rmap->n,anz,pnz;
  Mat_PtAPMPI    *ptap       = c->ptap;
  PetscScalar    *apa_sparse = ptap->apa;
  PetscInt       *api,*apj,*apJ,i,j,k,row;
  PetscInt       cstart = C->cmap->rstart;
  PetscInt       cdnz,conz,k0,k1,nextp;

  PetscFunctionBegin;
  /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  /* update numerical values of P_oth and P_loc */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
  /*----------------------------------------------------------*/
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
  pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;

  api = ptap->api;
  apj = ptap->apj;
  for (i=0; i<cm; i++) {
    apJ = apj + api[i];

    /* diagonal portion of A */
    anz = adi[i+1] - adi[i];
    adj = ad->j + adi[i];
    ada = ad->a + adi[i];
    for (j=0; j<anz; j++) {
      row = adj[j];
      pnz = pi_loc[row+1] - pi_loc[row];
      pj  = pj_loc + pi_loc[row];
      pa  = pa_loc + pi_loc[row];
      /* perform sparse axpy */
      valtmp = ada[j];
      nextp  = 0;
      for (k=0; nextp<pnz; k++) {
        if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
          apa_sparse[k] += valtmp*pa[nextp++];
        }
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* off-diagonal portion of A */
    anz = aoi[i+1] - aoi[i];
    aoj = ao->j + aoi[i];
    aoa = ao->a + aoi[i];
    for (j=0; j<anz; j++) {
      row = aoj[j];
      pnz = pi_oth[row+1] - pi_oth[row];
      pj  = pj_oth + pi_oth[row];
      pa  = pa_oth + pi_oth[row];
      /* perform sparse axpy */
      valtmp = aoa[j];
      nextp  = 0;
      for (k=0; nextp<pnz; k++) {
        if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
          apa_sparse[k] += valtmp*pa[nextp++];
        }
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* set values in C */
    cdnz = cd->i[i+1] - cd->i[i];
    conz = co->i[i+1] - co->i[i];

    /* 1st off-diagonal part of C */
    ca = coa + co->i[i];
    k  = 0;
    for (k0=0; k0<conz; k0++) {
      if (apJ[k] >= cstart) break;
      ca[k0]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }

    /* diagonal part of C */
    ca = cda + cd->i[i];
    for (k1=0; k1<cdnz; k1++) {
      ca[k1]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }

    /* 2nd off-diagonal part of C */
    ca = coa + co->i[i];
    for (; k0<conz; k0++) {
      ca[k0]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

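/*
   The sparse axpy above relies on both apJ (the symbolic pattern of row i of
   AP) and pj (the pattern of the contributing row of P) being sorted, so the
   scatter into the compressed accumulator is a single merge pass rather than
   a dense lookup. A standalone sketch with hypothetical sorted index arrays
   (not the PETSc data structures), where acc[k] accumulates column apJ[k]:

     void spaxpy_merge(double v,int pnz,const int *pj,const double *pa,
                       int apnz,const int *apJ,double *acc)
     {
       int k,nextp = 0;
       for (k=0; k<apnz && nextp<pnz; k++) {
         if (apJ[k] == pj[nextp]) acc[k] += v*pa[nextp++];
       }
     }
*/
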
/* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ(), except using LLCondensed to avoid the O(B->cmap->N) memory requirement */
#undef __FUNCT__
#define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable"
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode     ierr;
  MPI_Comm           comm=((PetscObject)A)->comm;
  Mat                Cmpi;
  Mat_PtAPMPI        *ptap;
  PetscFreeSpaceList free_space = NULL,current_space=NULL;
  Mat_MPIAIJ         *a         = (Mat_MPIAIJ*)A->data,*c;
  Mat_SeqAIJ         *ad        = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
  PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
  PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
  PetscInt           i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max=0;
  PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
  PetscInt           nlnk_max,armax,prmax;
  PetscReal          afill;
  PetscScalar        *apa;

  PetscFunctionBegin;
  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr);

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j;
  pi_oth = p_oth->i; pj_oth = p_oth->j;

  /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
  /*-------------------------------------------------------------------*/
  ierr      = PetscMalloc((am+2)*sizeof(PetscInt),&api);CHKERRQ(ierr);
  ptap->api = api;
  api[0]    = 0;

  /* create and initialize a linked list */
  armax    = ad->rmax+ao->rmax;
  prmax    = PetscMax(p_loc->rmax,p_oth->rmax);
  nlnk_max = armax*prmax;
  if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
  ierr = PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
  ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);

  current_space = free_space;

  ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    /* diagonal portion of A */
    nzi = adi[i+1] - adi[i];
    for (j=0; j<nzi; j++) {
      row  = *adj++;
      pnz  = pi_loc[row+1] - pi_loc[row];
      Jptr = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }
    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    for (j=0; j<nzi; j++) {
      row  = *aoj++;
      pnz  = pi_oth[row+1] - pi_oth[row];
      Jptr = pj_oth + pi_oth[row];
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }

    apnz     = *lnk;
    api[i+1] = api[i] + apnz;
    if (apnz > apnz_max) apnz_max = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }

  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc((api[am]+1)*sizeof(PetscInt),&ptap->apj);CHKERRQ(ierr);
  apj  = ptap->apj;
  ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);

  /* create and assemble symbolic parallel matrix Cmpi */
  /*----------------------------------------------------*/
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,A->rmap->bs,P->cmap->bs);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* malloc apa for assembling Cmpi */
  ierr = PetscMalloc(apnz_max*sizeof(PetscScalar),&apa);CHKERRQ(ierr);
  ierr = PetscMemzero(apa,apnz_max*sizeof(PetscScalar));CHKERRQ(ierr);

  ptap->apa = apa;
  for (i=0; i<am; i++) {
    row  = i + rstart;
    apnz = api[i+1] - api[i];
    ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
    apj += apnz;
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->destroy             = Cmpi->ops->destroy;
  ptap->duplicate           = Cmpi->ops->duplicate;
  Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable;
  Cmpi->ops->destroy        = MatDestroy_MPIAIJ_MatMatMult;
  Cmpi->ops->duplicate      = MatDuplicate_MPIAIJ_MatMatMult;

  /* attach the supporting struct to Cmpi for reuse */
  c       = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap = ptap;

  *C = Cmpi;

  /* set MatInfo */
  afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
  if (afill < 1.0) afill = 1.0;
  Cmpi->info.mallocs           = nspacedouble;
  Cmpi->info.fill_ratio_given  = fill;
  Cmpi->info.fill_ratio_needed = afill;

#if defined(PETSC_USE_INFO)
  if (api[am]) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}

/*-------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMult_MPIAIJ_MPIAIJ"
PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  PetscBool      scalable=PETSC_TRUE,viamatmatmult=PETSC_FALSE;

  PetscFunctionBegin;
  ierr = PetscOptionsBool("-mattransposematmult_viamatmatmult","Use R=Pt and C=R*A","",viamatmatmult,&viamatmatmult,NULL);CHKERRQ(ierr);
  if (viamatmatmult) {
    Mat         Pt;
    Mat_PtAPMPI *ptap;
    Mat_MPIAIJ  *c;
    if (scall == MAT_INITIAL_MATRIX) {
      ierr = MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);CHKERRQ(ierr);
      ierr = MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr);

      c        = (Mat_MPIAIJ*)(*C)->data;
      ptap     = c->ptap;
      ptap->Pt = Pt;
    } else if (scall == MAT_REUSE_MATRIX) {
      c    = (Mat_MPIAIJ*)(*C)->data;
      ptap = c->ptap;
      Pt   = ptap->Pt;
      ierr = MatTranspose(P,scall,&Pt);CHKERRQ(ierr);
      ierr = MatMatMult(Pt,A,scall,fill,C);CHKERRQ(ierr);
    } else SETERRQ(((PetscObject)A)->comm,PETSC_ERR_ARG_WRONGSTATE,"Not supported");
    PetscFunctionReturn(0);
  }

  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
    ierr = PetscOptionsBool("-mattransposematmult_scalable","Use a scalable but slower C=Pt*A","",scalable,&scalable,NULL);CHKERRQ(ierr);
    if (scalable) {
      ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(P,A,fill,C);CHKERRQ(ierr);
    } else {
      ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnd();CHKERRQ(ierr);
  }
  ierr = (*(*C)->ops->mattransposemultnumeric)(P,A,*C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

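/*
   Usage sketch for C = Pt*A (assumptions: P and A are assembled MPIAIJ
   matrices with matching row layouts; error checking abbreviated):

     Mat C;
     ierr = MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);
     ierr = MatTransposeMatMult(P,A,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);

   With -mattransposematmult_viamatmatmult the product is instead formed as
   an explicit transpose followed by MatMatMult(), as coded above; the
   transpose Pt is cached in ptap->Pt for reuse.
*/
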
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ"
PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
{
  PetscErrorCode      ierr;
  Mat_Merge_SeqsToMPI *merge;
  Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
  Mat_PtAPMPI         *ptap;
  PetscInt            *adj,*aJ;
  PetscInt            i,j,k,anz,pnz,row,*cj;
  MatScalar           *ada,*aval,*ca,valtmp;
  PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
  MPI_Comm            comm=((PetscObject)C)->comm;
  PetscMPIInt         size,rank,taga,*len_s;
  PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
  PetscInt            **buf_ri,**buf_rj;
  PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
  MPI_Request         *s_waits,*r_waits;
  MPI_Status          *status;
  MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
  PetscInt            *ai,*aj,*coi,*coj;
  PetscInt            *poJ,*pdJ;
  Mat                 A_loc;
  Mat_SeqAIJ          *a_loc;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ptap  = c->ptap;
  merge = ptap->merge;

  /* 2) compute numeric C_seq = P_loc^T*A_loc - dominating part */
  /*------------------------------------------------------------*/
  /* get data from symbolic products */
  coi  = merge->coi; coj = merge->coj;
  ierr = PetscMalloc((coi[pon]+1)*sizeof(MatScalar),&coa);CHKERRQ(ierr);
  ierr = PetscMemzero(coa,coi[pon]*sizeof(MatScalar));CHKERRQ(ierr);

  bi     = merge->bi; bj = merge->bj;
  owners = merge->rowmap->range;
  ierr   = PetscMalloc((bi[cm]+1)*sizeof(MatScalar),&ba);CHKERRQ(ierr);
  ierr   = PetscMemzero(ba,bi[cm]*sizeof(MatScalar));CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  A_loc = ptap->A_loc;
  ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
  a_loc = (Mat_SeqAIJ*)(A_loc)->data;
  ai    = a_loc->i;
  aj    = a_loc->j;

  ierr = PetscMalloc((A->cmap->N)*sizeof(PetscScalar),&aval);CHKERRQ(ierr); /* non-scalable!!! */
  ierr = PetscMemzero(aval,A->cmap->N*sizeof(PetscScalar));CHKERRQ(ierr);

  for (i=0; i<am; i++) {
    /* 2-a) put A[i,:] to dense array aval */
    anz = ai[i+1] - ai[i];
    adj = aj + ai[i];
    ada = a_loc->a + ai[i];
    for (j=0; j<anz; j++) {
      aval[adj[j]] = ada[j];
    }

    /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
    /*--------------------------------------------------------------*/
    /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
    pnz = po->i[i+1] - po->i[i];
    poJ = po->j + po->i[i];
    pA  = po->a + po->i[i];
    for (j=0; j<pnz; j++) {
      row = poJ[j];
      cnz = coi[row+1] - coi[row];
      cj  = coj + coi[row];
      ca  = coa + coi[row];
      /* perform dense axpy */
      valtmp = pA[j];
      for (k=0; k<cnz; k++) {
        ca[k] += valtmp*aval[cj[k]];
      }
      ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
    }

    /* put the value into Cd (diagonal part) */
    pnz = pd->i[i+1] - pd->i[i];
    pdJ = pd->j + pd->i[i];
    pA  = pd->a + pd->i[i];
    for (j=0; j<pnz; j++) {
      row = pdJ[j];
      cnz = bi[row+1] - bi[row];
      cj  = bj + bi[row];
      ca  = ba + bi[row];
      /* perform dense axpy */
      valtmp = pA[j];
      for (k=0; k<cnz; k++) {
        ca[k] += valtmp*aval[cj[k]];
      }
      ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
    }

    /* zero out the current row of A in the dense array aval */
    aJ = aj + ai[i];
    for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
  }

  /* 3) send and recv matrix values coa */
  /*------------------------------------*/
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;
  len_s  = merge->len_s;
  ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
  ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc2(merge->nsend+1,MPI_Request,&s_waits,size,MPI_Status,&status);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = merge->owners_co[proc];
    ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }
  if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}

  ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);
  ierr = PetscFree(coa);CHKERRQ(ierr);

  /* 4) insert local Cseq and received values into Cmpi */
  /*----------------------------------------------------*/
  ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of the k-th received i-structure */
    nrows       = *(buf_ri_k[k]);
    nextrow[k]  = buf_ri_k[k]+1;  /* next row number of the k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of the k-th received i-structure */
  }

  for (i=0; i<cm; i++) {
    row  = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj + bi[i];  /* col indices of the i-th row of C */
    ba_i = ba + bi[i];
    bnz  = bi[i+1] - bi[i];
    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++) { /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cnz    = *(nextci[k]+1) - *nextci[k];
        cj     = buf_rj[k] + *(nextci[k]);
        ca     = abuf_r[k] + *(nextci[k]);
        nextcj = 0;
        for (j=0; nextcj<cnz; j++) {
          if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
            ba_i[j] += ca[nextcj++];
          }
        }
        nextrow[k]++; nextci[k]++;
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }
    }
    ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ierr = PetscFree(ba);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscFree(aval);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

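/*
   The numeric phase above forms C = Pt*A one row of A at a time as a sum of
   outer products: row i of A is expanded into the dense array aval, and for
   each nonzero P(i,j) the scaled row valtmp*A[i,:] is added into row j of C.
   In matrix terms,

     C = sum_i P[i,:]^T * A[i,:],

   so one pass over the local rows of A suffices; contributions to rows of C
   owned by other processes accumulate in coa and are merged by the sends and
   receives of step 3.
*/
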
/* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ"
PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
{
  PetscErrorCode      ierr;
  Mat                 Cmpi,A_loc,POt,PDt;
  Mat_PtAPMPI         *ptap;
  PetscFreeSpaceList  free_space=NULL,current_space=NULL;
  Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
  PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
  PetscInt            nnz;
  PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
  PetscInt            am=A->rmap->n,pn=P->cmap->n;
  PetscBT             lnkbt;
  MPI_Comm            comm=((PetscObject)A)->comm;
  PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt            len,proc,*dnz,*onz,*owners;
  PetscInt            nzi,*bi,*bj;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  Mat_Merge_SeqsToMPI *merge;
  PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
  PetscReal           afill  =1.0,afill_tmp;
  PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
  PetscScalar         *vals;
  Mat_SeqAIJ          *a_loc, *pdt,*pot;

  PetscFunctionBegin;
  /* check if matrix local sizes are compatible */
  if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
  }

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);

  ptap->A_loc = A_loc;

  a_loc = (Mat_SeqAIJ*)(A_loc)->data;
  ai    = a_loc->i;
  aj    = a_loc->j;

  /* determine symbolic Co=(p->B)^T*A - send to others */
  /*----------------------------------------------------*/
  ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
  pdt  = (Mat_SeqAIJ*)PDt->data;
  pdti = pdt->i; pdtj = pdt->j;

  ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
  pot  = (Mat_SeqAIJ*)POt->data;
  poti = pot->i; potj = pot->j;

  /* then, compute symbolic Co = (p->B)^T*A */
  pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */
  ierr   = PetscMalloc((pon+1)*sizeof(PetscInt),&coi);CHKERRQ(ierr);
  coi[0] = 0;

  /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
  nnz           = (PetscInt)(fill*(poti[pon] + ai[am]));
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* create and initialize a linked list */
  i     = PetscMax(pdt->rmax,pot->rmax);
  Crmax = i*a_loc->rmax*size;
  if (!Crmax || Crmax > aN) Crmax = aN;
  ierr = PetscLLCondensedCreate(Crmax,aN,&lnk,&lnkbt);CHKERRQ(ierr);

  for (i=0; i<pon; i++) {
    pnz = poti[i+1] - poti[i];
    ptJ = potj + poti[i];
    for (j=0; j<pnz; j++) {
      row  = ptJ[j]; /* row of A_loc == col of Pot */
      anz  = ai[row+1] - ai[row];
      Jptr = aj + ai[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    nnz = lnk[0];

    /* If free space is not available, double the total space in the list */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, and zero out dense rows */
    ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;

    coi[i+1] = coi[i] + nnz;
  }

  ierr = PetscMalloc((coi[pon]+1)*sizeof(PetscInt),&coj);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);

  afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;

  /* send j-array (coj) of Co to other processors */
  /*----------------------------------------------*/
  /* determine row ownership */
  ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
  ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);

  merge->rowmap->n  = pn;
  merge->rowmap->bs = 1;

  ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
  owners = merge->rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
  ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);

  len_s        = merge->len_s;
  merge->nsend = 0;

  ierr = PetscMalloc((size+2)*sizeof(PetscInt),&owners_co);CHKERRQ(ierr);
  ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

  proc = 0;
  for (i=0; i<pon; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i];
  }

  len          = 0; /* max length of buf_si[] */
  owners_co[0] = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_si[proc]) {
      merge->nsend++;
      len_si[proc] = 2*(len_si[proc] + 1);
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* receives and sends of coj are complete */
  ierr = PetscMalloc(size*sizeof(MPI_Status),&sstatus);CHKERRQ(ierr);
  for (i=0; i<merge->nrecv; i++) {
    PetscMPIInt icompleted;
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* send and recv coi */
  /*-------------------*/
  ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr   = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of the k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1;
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi               = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
      buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
  i = merge->nrecv;
  while (i--) {
    PetscMPIInt icompleted;
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
  ierr = PetscFree(len_si);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(sstatus);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

  /* compute the local portion of C (mpi mat) */
  /*------------------------------------------*/
  /* allocate bi array and free space for accumulating nonzero column info */
  ierr  = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
  bi[0] = 0;

  /* set initial free space to be fill*(nnz(P) + nnz(A)) */
  nnz           = (PetscInt)(fill*(pdti[pn] + poti[pon] + ai[am]));
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of the k-th received i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of the k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of the k-th received i-structure */
  }

  ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
  rmax = 0;
  for (i=0; i<pn; i++) {
    /* add pdt[i,:]*AP into lnk */
    pnz = pdti[i+1] - pdti[i];
    ptJ = pdtj + pdti[i];
    for (j=0; j<pnz; j++) {
      row  = ptJ[j];  /* row of AP == col of Pt */
      anz  = ai[row+1] - ai[row];
      Jptr = aj + ai[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }

    /* add received col data into lnk */
    for (k=0; k<merge->nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nnz = lnk[0];

    /* if free space is not available, make more free space */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }
    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;

    bi[i+1] = bi[i] + nnz;
    if (nnz > rmax) rmax = nnz;
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);

  ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);

  afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;
  ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
  ierr = MatDestroy(&POt);CHKERRQ(ierr);
  ierr = MatDestroy(&PDt);CHKERRQ(ierr);

  /* create symbolic parallel matrix Cmpi (why can this not be assembled in the numeric part?) */
  /*-------------------------------------------------------------------------------------------*/
  ierr = PetscMalloc((rmax+1)*sizeof(PetscScalar),&vals);CHKERRQ(ierr);
  ierr = PetscMemzero(vals,rmax*sizeof(PetscScalar));CHKERRQ(ierr);

  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,P->cmap->bs,A->cmap->bs);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    row  = i + rstart;
    nnz  = bi[i+1] - bi[i];
    Jptr = bj + bi[i];
    ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = PetscFree(vals);CHKERRQ(ierr);

  merge->bi        = bi;
  merge->bj        = bj;
  merge->coi       = coi;
  merge->coj       = coj;
  merge->buf_ri    = buf_ri;
  merge->buf_rj    = buf_rj;
  merge->owners_co = owners_co;
  merge->destroy   = Cmpi->ops->destroy;
  merge->duplicate = Cmpi->ops->duplicate;

  Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
  Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;

  /* attach the supporting struct to Cmpi for reuse */
  c           = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap     = ptap;
  ptap->api   = NULL;
  ptap->apj   = NULL;
  ptap->merge = merge;
  ptap->rmax  = rmax;

  *C = Cmpi;
#if defined(PETSC_USE_INFO)
  if (bi[pn] != 0) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable"
PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable(Mat P,Mat A,Mat C)
{
  PetscErrorCode      ierr;
  Mat_Merge_SeqsToMPI *merge;
  Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
  Mat_PtAPMPI         *ptap;
  PetscInt            *adj;
  PetscInt            i,j,k,anz,pnz,row,*cj,nexta;
  MatScalar           *ada,*ca,valtmp;
  PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
  MPI_Comm            comm=((PetscObject)C)->comm;
  PetscMPIInt         size,rank,taga,*len_s;
  PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
  PetscInt            **buf_ri,**buf_rj;
  PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
  MPI_Request         *s_waits,*r_waits;
  MPI_Status          *status;
  MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
  PetscInt            *ai,*aj,*coi,*coj;
  PetscInt            *poJ,*pdJ;
  Mat                 A_loc;
  Mat_SeqAIJ          *a_loc;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ptap  = c->ptap;
  merge = ptap->merge;

  /* 2) compute numeric C_seq = P_loc^T*A_loc */
  /*------------------------------------------*/
  /* get data from symbolic products */
  coi    = merge->coi; coj = merge->coj;
  ierr   = PetscMalloc((coi[pon]+1)*sizeof(MatScalar),&coa);CHKERRQ(ierr);
  ierr   = PetscMemzero(coa,coi[pon]*sizeof(MatScalar));CHKERRQ(ierr);
  bi     = merge->bi; bj = merge->bj;
  owners = merge->rowmap->range;
  ierr   = PetscMalloc((bi[cm]+1)*sizeof(MatScalar),&ba);CHKERRQ(ierr);
  ierr   = PetscMemzero(ba,bi[cm]*sizeof(MatScalar));CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  A_loc = ptap->A_loc;
  ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
  a_loc = (Mat_SeqAIJ*)(A_loc)->data;
  ai    = a_loc->i;
  aj    = a_loc->j;

  for (i=0; i<am; i++) {
    anz = ai[i+1] - ai[i];
    adj = aj + ai[i];
    ada = a_loc->a + ai[i];

    /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
    /*-------------------------------------------------------------*/
    /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
    pnz = po->i[i+1] - po->i[i];
    poJ = po->j + po->i[i];
    pA  = po->a + po->i[i];
    for (j=0; j<pnz; j++) {
      row = poJ[j];
      cj  = coj + coi[row];
      ca  = coa + coi[row];
      /* perform sparse axpy */
      nexta  = 0;
      valtmp = pA[j];
      for (k=0; nexta<anz; k++) {
        if (cj[k] == adj[nexta]) {
          ca[k] += valtmp*ada[nexta];
          nexta++;
        }
      }
      ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
    }

    /* put the value into Cd (diagonal part) */
    pnz = pd->i[i+1] - pd->i[i];
    pdJ = pd->j + pd->i[i];
    pA  = pd->a + pd->i[i];
    for (j=0; j<pnz; j++) {
      row = pdJ[j];
      cj  = bj + bi[row];
      ca  = ba + bi[row];
      /* perform sparse axpy */
      nexta  = 0;
      valtmp = pA[j];
      for (k=0; nexta<anz; k++) {
        if (cj[k] == adj[nexta]) {
          ca[k] += valtmp*ada[nexta];
          nexta++;
        }
      }
      ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
    }
  }

  /* 3) send and recv matrix values coa */
  /*------------------------------------*/
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;
  len_s  = merge->len_s;
  ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
  ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc2(merge->nsend+1,MPI_Request,&s_waits,size,MPI_Status,&status);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = merge->owners_co[proc];
    ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }
  if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}

  ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);
  ierr = PetscFree(coa);CHKERRQ(ierr);

  /* 4) insert local Cseq and received values into Cmpi */
  /*----------------------------------------------------*/
  ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++) {
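  /* ba accumulates the locally owned rows of C (offsets bi,bj from the symbolic
     phase); coa accumulates rows of Co destined for other processes (offsets
     coi,coj) and is shipped out in step 3 below */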
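    /* outer-product view: C += P_loc(i,:)^T * A(i,:), i.e. each nonzero P(i,row)
       scatters valtmp*A(i,:) into row `row' of the product */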
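      /* two-pointer merge: cj[] holds the (sorted) symbolic pattern of this row of
         Co, a superset of the sorted adj[], so each adj[nexta] is matched exactly once */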
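  /* the receives are posted before the matching sends below so incoming values
     can land directly in abuf_r[] without intermediate MPI buffering */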
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
    nrows       = *(buf_ri_k[k]);
    nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
  }

  for (i=0; i<cm; i++) {
    row  = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj + bi[i];  /* col indices of the i-th row of C */
    ba_i = ba + bi[i];
    bnz  = bi[i+1] - bi[i];
    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++) { /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cnz    = *(nextci[k]+1) - *nextci[k];
        cj     = buf_rj[k] + *(nextci[k]);
        ca     = abuf_r[k] + *(nextci[k]);
        nextcj = 0;
        for (j=0; nextcj<cnz; j++) {
          if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
            ba_i[j] += ca[nextcj++];
          }
        }
        nextrow[k]++; nextci[k]++;
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }
    }
    ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ierr = PetscFree(ba);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable"
PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(Mat P,Mat A,PetscReal fill,Mat *C)
{
  PetscErrorCode      ierr;
  Mat                 Cmpi,A_loc,POt,PDt;
  Mat_PtAPMPI         *ptap;
  PetscFreeSpaceList  free_space=NULL,current_space=NULL;
  Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
  PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
  PetscInt            nnz;
  PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
  PetscInt            am  =A->rmap->n,pn=P->cmap->n;
  MPI_Comm            comm=((PetscObject)A)->comm;
  PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt            len,proc,*dnz,*onz,*owners;
  PetscInt            nzi,*bi,*bj;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  Mat_Merge_SeqsToMPI *merge;
  PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
  PetscReal           afill  =1.0,afill_tmp;
  PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
  PetscScalar         *vals;
  Mat_SeqAIJ          *a_loc, *pdt,*pot;

  PetscFunctionBegin;
  /* check if matrix local sizes are compatible */
  if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
  }

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

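  /* each received i-structure is laid out as {nrows, row[0..nrows-1], off[0..nrows]};
     nextrow[k] and nextci[k] walk the row-index and offset sub-arrays in lockstep,
     e.g. rows {2,5} with 3 and 4 values arrive (roughly) as {2, 2,5, 0,3,7} */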
  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);

  ptap->A_loc = A_loc;
  a_loc       = (Mat_SeqAIJ*)(A_loc)->data;
  ai          = a_loc->i;
  aj          = a_loc->j;

  /* determine symbolic Co=(p->B)^T*A - send to others */
  /*----------------------------------------------------*/
  ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
  pdt  = (Mat_SeqAIJ*)PDt->data;
  pdti = pdt->i; pdtj = pdt->j;

  ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
  pot  = (Mat_SeqAIJ*)POt->data;
  poti = pot->i; potj = pot->j;

  /* then, compute symbolic Co = (p->B)^T*A */
  pon    = (p->B)->cmap->n; /* total number of rows of Co to be sent to other processors;
                               >= (number of nonzero rows of C_seq) - pn */
  ierr   = PetscMalloc((pon+1)*sizeof(PetscInt),&coi);CHKERRQ(ierr);
  coi[0] = 0;

  /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
  nnz           = fill*(poti[pon] + ai[am]);
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* create and initialize a linked list */
  i     = PetscMax(pdt->rmax,pot->rmax);
  Crmax = i*a_loc->rmax*size; /* non-scalable! */
  if (!Crmax || Crmax > aN) Crmax = aN;
  ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);

  for (i=0; i<pon; i++) {
    pnz = poti[i+1] - poti[i];
    ptJ = potj + poti[i];
    for (j=0; j<pnz; j++) {
      row  = ptJ[j]; /* row of A_loc == col of Pot */
      anz  = ai[row+1] - ai[row];
      Jptr = aj + ai[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
    }
    nnz = lnk[0];

    /* If free space is not available, double the total space in the list */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

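  /* Crmax is a cheap upper bound on the nonzeros in any row of C: each row of P^T
     touches at most max(pdt->rmax,pot->rmax) rows of A, each contributing at most
     a_loc->rmax columns, summed over all ranks (hence the size factor flagged as
     non-scalable), and is capped by the global column count aN */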
    /* Copy data into free space, and zero out dense rows */
    ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);

    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;

    coi[i+1] = coi[i] + nnz;
  }

  ierr = PetscMalloc((coi[pon]+1)*sizeof(PetscInt),&coj);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);

  afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;

  /* send j-array (coj) of Co to other processors */
  /*----------------------------------------------*/
  /* determine row ownership */
  ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
  ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);

  merge->rowmap->n  = pn;
  merge->rowmap->bs = 1;

  ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
  owners = merge->rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
  ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);

  len_s        = merge->len_s;
  merge->nsend = 0;

  ierr = PetscMalloc((size+2)*sizeof(PetscInt),&owners_co);CHKERRQ(ierr);
  ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

  proc = 0;
  for (i=0; i<pon; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i];
  }

  len          = 0; /* max length of buf_si[] */
  owners_co[0] = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_si[proc]) {
      merge->nsend++;
      len_si[proc] = 2*(len_si[proc] + 1);
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* receives and sends of coj are complete */
  ierr = PetscMalloc(size*sizeof(MPI_Status),&sstatus);CHKERRQ(ierr);
  for (i=0; i<merge->nrecv; i++) {
    PetscMPIInt icompleted;
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* send and recv coi */
  /*-------------------*/
  ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr   = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
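  /* after the doubling above, len_si[proc] = 2*(nrows+1): each i-structure message
     carries a count, nrows receiver-local row indices, and nrows+1 cumulative offsets */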
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (local to the receiving process)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
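    /* e.g. two rows with nz0 and nz1 nonzeros, landing at receiver-local rows r0 and
       r1, go out (roughly) as buf_si = {2, r0, r1, 0, nz0, nz0+nz1} */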
    nrows       = len_si[proc]/2 - 1;
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi               = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
      buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
  i = merge->nrecv;
  while (i--) {
    PetscMPIInt icompleted;
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
  ierr = PetscFree(len_si);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(sstatus);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

  /* compute the local portion of C (mpi mat) */
  /*------------------------------------------*/
  /* allocate bi array and free space for accumulating nonzero column info */
  ierr  = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
  bi[0] = 0;

  /* set initial free space to be fill*(nnz(P) + nnz(A)) */
  nnz           = fill*(pdti[pn] + poti[pon] + ai[am]);
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
  }

  ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
  rmax = 0;
  for (i=0; i<pn; i++) {
    /* add pdt[i,:]*AP into lnk */
    pnz = pdti[i+1] - pdti[i];
    ptJ = pdtj + pdti[i];
    for (j=0; j<pnz; j++) {
      row  = ptJ[j];  /* row of AP == col of Pt */
      anz  = ai[row+1] - ai[row];
      Jptr = aj + ai[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
    }

    /* add received col data into lnk */
    for (k=0; k<merge->nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nnz = lnk[0];

    /* if free space is not available, make more free space */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }
    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;

    bi[i+1] = bi[i] + nnz;
    if (nnz > rmax) rmax = nnz;
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);

  ierr      = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
  ierr      = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
  ierr = MatDestroy(&POt);CHKERRQ(ierr);
  ierr = MatDestroy(&PDt);CHKERRQ(ierr);

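  /* dnz/onz count, for each local row of C, the nonzeros falling in the diagonal
     and off-diagonal column blocks; they drive MatMPIAIJSetPreallocation() below */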
  /* create symbolic parallel matrix Cmpi - why can this not be assembled in the numeric part? */
  /*---------------------------------------------------------------------------------------------*/
  ierr = PetscMalloc((rmax+1)*sizeof(PetscScalar),&vals);CHKERRQ(ierr);
  ierr = PetscMemzero(vals,rmax*sizeof(PetscScalar));CHKERRQ(ierr);

  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,P->cmap->bs,A->cmap->bs);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    row  = i + rstart;
    nnz  = bi[i+1] - bi[i];
    Jptr = bj + bi[i];
    ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = PetscFree(vals);CHKERRQ(ierr);

  merge->bi        = bi;
  merge->bj        = bj;
  merge->coi       = coi;
  merge->coj       = coj;
  merge->buf_ri    = buf_ri;
  merge->buf_rj    = buf_rj;
  merge->owners_co = owners_co;
  merge->destroy   = Cmpi->ops->destroy;
  merge->duplicate = Cmpi->ops->duplicate;

  Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable;
  Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;

  /* attach the supporting struct to Cmpi for reuse */
  c = (Mat_MPIAIJ*)Cmpi->data;

  c->ptap     = ptap;
  ptap->api   = NULL;
  ptap->apj   = NULL;
  ptap->merge = merge;
  ptap->rmax  = rmax;
  ptap->apa   = NULL;

  *C = Cmpi;
#if defined(PETSC_USE_INFO)
  if (bi[pn] != 0) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}