/* xref: /petsc/src/mat/impls/aij/mpi/mpimatmatmult.c (revision a201590f66d78a0ee8e1bf4e6f9fcce39f32a809) */

/*
  Defines matrix-matrix product routines for pairs of MPIAIJ matrices
          C = A * B
*/
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/utils/freespace.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <../src/mat/impls/dense/mpi/mpidense.h>

#undef __FUNCT__
#define __FUNCT__ "MatMatMult_MPIAIJ_MPIAIJ"
PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX){
    ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
    ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
  }

  ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
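
/*
  Usage sketch of the user-level entry point that dispatches here (a minimal
  illustrative example, assuming A and B are assembled MATMPIAIJ matrices with
  compatible layouts):

    Mat C;
    ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr); // symbolic + numeric
    // ... update entries of A or B without changing their nonzero patterns ...
    ierr = MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);   // numeric phase only
    ierr = MatDestroy(&C);CHKERRQ(ierr);
*/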

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJ_MatMatMult"
PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI    *ptap=a->ptap;

  PetscFunctionBegin;
  ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
  ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
  ierr = PetscFree(ptap->api);CHKERRQ(ierr);
  ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
  ierr = PetscFree(ptap->apa);CHKERRQ(ierr);
  ierr = ptap->destroy(A);CHKERRQ(ierr);
  ierr = PetscFree(ptap);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDuplicate_MPIAIJ_MatMatMult"
PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a=(Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI        *ptap=a->ptap;

  PetscFunctionBegin;
  ierr = (*ptap->duplicate)(A,op,M);CHKERRQ(ierr);
  (*M)->ops->destroy   = ptap->destroy;   /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
  (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
  PetscFunctionReturn(0);
}
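
/*
  Note: the destroy/duplicate pair above follows the usual PETSc pattern for
  product matrices carrying extra symbolic data: the symbolic routine saves the
  type's original function pointers and installs these hooks, schematically
  (a sketch of what MatMatMultSymbolic_MPIAIJ_MPIAIJ does below):

    ptap->destroy      = Cmpi->ops->destroy;            // remember MatDestroy_MPIAIJ
    Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult;  // free ptap, then chain to it

  so the attached Mat_PtAPMPI is freed exactly once, and duplicates of C fall
  back to plain MPIAIJ behavior.
*/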

#undef __FUNCT__
#define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ"
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a=(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ         *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ         *cd=(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
  PetscInt           *adi=ad->i,*adj,*aoi=ao->i,*aoj;
  PetscScalar        *ada,*aoa,*cda=cd->a,*coa=co->a;
  Mat_SeqAIJ         *p_loc,*p_oth;
  PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
  PetscScalar        *pa_loc,*pa_oth,*pa,*apa,valtmp,*ca;
  PetscInt           cm=C->rmap->n,anz,pnz;
  Mat_PtAPMPI        *ptap=c->ptap;
  PetscInt           *api,*apj,*apJ,i,j,k,row;
  PetscInt           cstart=C->cmap->rstart;
  PetscInt           cdnz,conz,k0,k1;

  PetscFunctionBegin;
  /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  /* update numerical values of P_oth and P_loc */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
  /*----------------------------------------------------------*/
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
  pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;

  /* get apa for storing dense row A[i,:]*P */
  apa = ptap->apa;

  api = ptap->api;
  apj = ptap->apj;
  for (i=0; i<cm; i++) {
    /* diagonal portion of A */
    anz = adi[i+1] - adi[i];
    adj = ad->j + adi[i];
    ada = ad->a + adi[i];
    for (j=0; j<anz; j++) {
      row = adj[j];
      pnz = pi_loc[row+1] - pi_loc[row];
      pj  = pj_loc + pi_loc[row];
      pa  = pa_loc + pi_loc[row];

      /* perform dense axpy */
      valtmp = ada[j];
      for (k=0; k<pnz; k++){
        apa[pj[k]] += valtmp*pa[k];
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* off-diagonal portion of A */
    anz = aoi[i+1] - aoi[i];
    aoj = ao->j + aoi[i];
    aoa = ao->a + aoi[i];
    for (j=0; j<anz; j++) {
      row = aoj[j];
      pnz = pi_oth[row+1] - pi_oth[row];
      pj  = pj_oth + pi_oth[row];
      pa  = pa_oth + pi_oth[row];

      /* perform dense axpy */
      valtmp = aoa[j];
      for (k=0; k<pnz; k++){
        apa[pj[k]] += valtmp*pa[k];
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* set values in C */
    apJ = apj + api[i];
    cdnz = cd->i[i+1] - cd->i[i];
    conz = co->i[i+1] - co->i[i];

    /* 1st off-diagonal part of C */
    ca = coa + co->i[i];
    k  = 0;
    for (k0=0; k0<conz; k0++){
      if (apJ[k] >= cstart) break;
      ca[k0]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }

    /* diagonal part of C */
    ca = cda + cd->i[i];
    for (k1=0; k1<cdnz; k1++){
      ca[k1]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }

    /* 2nd off-diagonal part of C */
    ca = coa + co->i[i];
    for (; k0<conz; k0++){
      ca[k0]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
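
/*
  The loop above is Gustavson's row-by-row product with a dense accumulator:
  row i of C is formed as sum_j A(i,j)*P(j,:), accumulated into apa[] (length
  pN), then gathered into C's preallocated pattern and re-zeroed.  A minimal
  sequential sketch of the same kernel on raw CSR arrays (a hypothetical
  helper, not part of this file; assumes acc[] is zeroed on entry and that
  (ci,cj) already hold the symbolic pattern of C):

    static void SpGEMMRow(PetscInt i,
                          const PetscInt *ai,const PetscInt *aj,const PetscScalar *aa,
                          const PetscInt *pi,const PetscInt *pj,const PetscScalar *pa,
                          const PetscInt *ci,const PetscInt *cj,PetscScalar *ca,
                          PetscScalar *acc)
    {
      PetscInt j,k;
      for (j=ai[i]; j<ai[i+1]; j++) {          // scale row aj[j] of P by A(i,aj[j])
        PetscScalar v = aa[j];
        for (k=pi[aj[j]]; k<pi[aj[j]+1]; k++) acc[pj[k]] += v*pa[k];
      }
      for (k=ci[i]; k<ci[i+1]; k++) {          // gather into C(i,:), re-zero accumulator
        ca[k]      = acc[cj[k]];
        acc[cj[k]] = 0.0;
      }
    }
*/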

#undef __FUNCT__
#define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ"
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode       ierr;
  MPI_Comm             comm=((PetscObject)A)->comm;
  Mat                  Cmpi;
  Mat_PtAPMPI          *ptap;
  PetscFreeSpaceList   free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*c;
  Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
  PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
  PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
  PetscInt             *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
  PetscInt             am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
  PetscBT              lnkbt;
  PetscScalar          *apa;
  PetscReal            afill;
  PetscBool            scalable=PETSC_FALSE;
  PetscInt             nlnk_max,armax,prmax;

  PetscFunctionBegin;
  if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend){
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
  }

  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
    ierr = PetscOptionsBool("-matmatmult_scalable","Use a scalable but slower C=A*B","",scalable,&scalable,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  if (scalable){
    ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(A,P,fill,C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr);

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j;
  pi_oth = p_oth->i; pj_oth = p_oth->j;

  /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
  /*-------------------------------------------------------------------*/
  ierr  = PetscMalloc((am+2)*sizeof(PetscInt),&api);CHKERRQ(ierr);
  ptap->api = api;
  api[0]    = 0;

  /* create and initialize a linked list */
  armax = ad->rmax+ao->rmax;
  prmax = PetscMax(p_loc->rmax,p_oth->rmax);
  nlnk_max = armax*prmax;
  if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
  ierr = PetscLLCondensedCreate(nlnk_max,pN,&lnk,&lnkbt);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
  ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    apnz = 0;
    /* diagonal portion of A */
    nzi = adi[i+1] - adi[i];
    for (j=0; j<nzi; j++){
      row = *adj++;
      pnz = pi_loc[row+1] - pi_loc[row];
      Jptr  = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    for (j=0; j<nzi; j++){
      row = *aoj++;
      pnz = pi_oth[row+1] - pi_oth[row];
      Jptr  = pj_oth + pi_oth[row];
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }

    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);
    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }

  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc((api[am]+1)*sizeof(PetscInt),&ptap->apj);CHKERRQ(ierr);
  apj  = ptap->apj;
  ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* malloc apa to store dense row A[i,:]*P */
  ierr = PetscMalloc(pN*sizeof(PetscScalar),&apa);CHKERRQ(ierr);
  ierr = PetscMemzero(apa,pN*sizeof(PetscScalar));CHKERRQ(ierr);
  ptap->apa = apa;

  /* create and assemble symbolic parallel matrix Cmpi */
  /*----------------------------------------------------*/
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,A->rmap->bs,P->cmap->bs);CHKERRQ(ierr);

  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++){
    row  = i + rstart;
    apnz = api[i+1] - api[i];
    ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
    apj += apnz;
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->destroy             = Cmpi->ops->destroy;
  ptap->duplicate           = Cmpi->ops->duplicate;
  Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ;
  Cmpi->ops->destroy        = MatDestroy_MPIAIJ_MatMatMult;
  Cmpi->ops->duplicate      = MatDuplicate_MPIAIJ_MatMatMult;

  /* attach the supporting struct to Cmpi for reuse */
  c = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap  = ptap;

  *C = Cmpi;

  /* set MatInfo */
  afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
  if (afill < 1.0) afill = 1.0;
  Cmpi->info.mallocs           = nspacedouble;
  Cmpi->info.fill_ratio_given  = fill;
  Cmpi->info.fill_ratio_needed = afill;

#if defined(PETSC_USE_INFO)
  if (api[am]) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}
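
/*
  The symbolic phase above only tracks column indices: each row of AP is the
  union of the rows of P selected by the nonzeros of A[i,:], merged through a
  condensed sorted linked list and buffered in a growable free-space list.
  The "fill" argument is the caller's estimate of nnz(C)/(nnz(A)+nnz(P)) and
  sizes the initial buffer; nspacedouble counts how often that guess had to be
  enlarged.  A caller expecting heavy fill-in might request, for example
  (sketch):

    ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,4.0,&C);CHKERRQ(ierr); // expect nnz(C) <= 4*(nnz(A)+nnz(B))
*/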

#undef __FUNCT__
#define __FUNCT__ "MatMatMult_MPIAIJ_MPIDense"
PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX){
    ierr = MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);CHKERRQ(ierr);
  }
  ierr = MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

typedef struct {
  Mat         workB;
  PetscScalar *rvalues,*svalues;
  MPI_Request *rwaits,*swaits;
} MPIAIJ_MPIDense;

#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJ_MPIDenseDestroy"
PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
{
  MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatDestroy(&contents->workB);CHKERRQ(ierr);
  ierr = PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);CHKERRQ(ierr);
  ierr = PetscFree(contents);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIDense"
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
{
  PetscErrorCode         ierr;
  Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
  PetscInt               nz = aij->B->cmap->n;
  PetscContainer         container;
  MPIAIJ_MPIDense        *contents;
  VecScatter             ctx = aij->Mvctx;
  VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
  VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
  PetscInt               m=A->rmap->n,n=B->cmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(((PetscObject)B)->comm,C);CHKERRQ(ierr);
  ierr = MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(*C,A->rmap->bs,B->cmap->bs);CHKERRQ(ierr);
  ierr = MatSetType(*C,MATMPIDENSE);CHKERRQ(ierr);
  ierr = MatMPIDenseSetPreallocation(*C,PETSC_NULL);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  (*C)->ops->matmult = MatMatMult_MPIAIJ_MPIDense;

  ierr = PetscNew(MPIAIJ_MPIDense,&contents);CHKERRQ(ierr);
  /* Create work matrix used to store off processor rows of B needed for local product */
  ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,PETSC_NULL,&contents->workB);CHKERRQ(ierr);
  /* Create work arrays needed */
  ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],PetscScalar,&contents->rvalues,
                      B->cmap->N*to->starts[to->n],PetscScalar,&contents->svalues,
                      from->n,MPI_Request,&contents->rwaits,
                      to->n,MPI_Request,&contents->swaits);CHKERRQ(ierr);

  ierr = PetscContainerCreate(((PetscObject)A)->comm,&container);CHKERRQ(ierr);
  ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
  ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
  ierr = PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);CHKERRQ(ierr);
  ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
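
/*
  The work buffers are attached to C with the standard PetscContainer idiom, so
  they persist across MAT_REUSE_MATRIX calls and are freed (via
  MatMPIAIJ_MPIDenseDestroy) when C is destroyed.  They are recovered the same
  way MatMPIDenseScatter does below:

    PetscContainer  container;
    MPIAIJ_MPIDense *contents;
    ierr = PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);CHKERRQ(ierr);
    ierr = PetscContainerGetPointer(container,(void**)&contents);CHKERRQ(ierr);
*/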

#undef __FUNCT__
#define __FUNCT__ "MatMPIDenseScatter"
/*
    Performs an efficient scatter on the rows of B needed by this process; this is
    a modification of the VecScatterBegin_() routines.
*/
PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
{
  Mat_MPIAIJ             *aij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode         ierr;
  PetscScalar            *b,*w,*svalues,*rvalues;
  VecScatter             ctx = aij->Mvctx;
  VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
  VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
  PetscInt               i,j,k;
  PetscInt               *sindices,*sstarts,*rindices,*rstarts;
  PetscMPIInt            *sprocs,*rprocs,nrecvs;
  MPI_Request            *swaits,*rwaits;
  MPI_Comm               comm = ((PetscObject)A)->comm;
  PetscMPIInt            tag = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
  MPI_Status             status;
  MPIAIJ_MPIDense        *contents;
  PetscContainer         container;
  Mat                    workB;

  PetscFunctionBegin;
  ierr = PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);CHKERRQ(ierr);
  if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
  ierr = PetscContainerGetPointer(container,(void**)&contents);CHKERRQ(ierr);

  workB = *outworkB = contents->workB;
  if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of columns of aij->B %D not equal to number of rows of workB %D",nrows,workB->rmap->n);
  sindices  = to->indices;
  sstarts   = to->starts;
  sprocs    = to->procs;
  swaits    = contents->swaits;
  svalues   = contents->svalues;

  rindices  = from->indices;
  rstarts   = from->starts;
  rprocs    = from->procs;
  rwaits    = contents->rwaits;
  rvalues   = contents->rvalues;

  ierr = MatGetArray(B,&b);CHKERRQ(ierr);
  ierr = MatGetArray(workB,&w);CHKERRQ(ierr);

  for (i=0; i<from->n; i++) {
    ierr = MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);CHKERRQ(ierr);
  }

  for (i=0; i<to->n; i++) {
    /* pack a message at a time */
    CHKMEMQ;
    for (j=0; j<sstarts[i+1]-sstarts[i]; j++){
      for (k=0; k<ncols; k++) {
        svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
      }
    }
    CHKMEMQ;
    ierr = MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);CHKERRQ(ierr);
  }

  nrecvs = from->n;
  while (nrecvs) {
    ierr = MPI_Waitany(from->n,rwaits,&imdex,&status);CHKERRQ(ierr);
    nrecvs--;
    /* unpack a message at a time */
    CHKMEMQ;
    for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++){
      for (k=0; k<ncols; k++) {
        w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
      }
    }
    CHKMEMQ;
  }
  if (to->n) {ierr = MPI_Waitall(to->n,swaits,to->sstatus);CHKERRQ(ierr);}

  ierr = MatRestoreArray(B,&b);CHKERRQ(ierr);
  ierr = MatRestoreArray(workB,&w);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
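
/*
  Indexing note for the pack/unpack loops above: MPIDense stores its local
  array column-major, so entry (r,k) of the local part of B lives at
  b[r + nrowsB*k].  Each message stores a requested row with all ncols columns
  contiguous, i.e.

    svalues[ncols*(sstarts[i]+j) + k] = B(sindices[sstarts[i]+j], k),

  and the receiver scatters back into workB with the same column-major layout,
  w[rindices[...] + nrows*k].
*/
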
extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);

#undef __FUNCT__
#define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIDense"
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
{
  PetscErrorCode       ierr;
  Mat_MPIAIJ           *aij = (Mat_MPIAIJ*)A->data;
  Mat_MPIDense         *bdense = (Mat_MPIDense*)B->data;
  Mat_MPIDense         *cdense = (Mat_MPIDense*)C->data;
  Mat                  workB;

  PetscFunctionBegin;

  /* diagonal block of A times all local rows of B */
  ierr = MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);CHKERRQ(ierr);

  /* get off processor parts of B needed to complete the product */
  ierr = MatMPIDenseScatter(A,B,C,&workB);CHKERRQ(ierr);

  /* off-diagonal block of A times nonlocal rows of B */
  ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
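
/*
  In matrix terms, the two kernels above compute on each process

    C_loc = A_diag * B_loc + A_off * B_oth,

  where B_oth (= workB) holds exactly the off-process rows of B referenced by
  the nonzero columns of A_off, gathered by MatMPIDenseScatter().
*/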

#undef __FUNCT__
#define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable"
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable(Mat A,Mat P,Mat C)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a=(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ         *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ         *cd=(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
  PetscInt           *adi=ad->i,*adj,*aoi=ao->i,*aoj;
  PetscScalar        *ada,*aoa,*cda=cd->a,*coa=co->a;
  Mat_SeqAIJ         *p_loc,*p_oth;
  PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
  PetscScalar        *pa_loc,*pa_oth,*pa,valtmp,*ca;
  PetscInt           cm=C->rmap->n,anz,pnz;
  Mat_PtAPMPI        *ptap=c->ptap;
  PetscScalar        *apa_sparse=ptap->apa;
  PetscInt           *api,*apj,*apJ,i,j,k,row;
  PetscInt           cstart=C->cmap->rstart;
  PetscInt           cdnz,conz,k0,k1,nextp;

  PetscFunctionBegin;
  /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  /* update numerical values of P_oth and P_loc */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
  /*----------------------------------------------------------*/
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
  pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;

  api = ptap->api;
  apj = ptap->apj;
  for (i=0; i<cm; i++) {
    apJ = apj + api[i];

    /* diagonal portion of A */
    anz = adi[i+1] - adi[i];
    adj = ad->j + adi[i];
    ada = ad->a + adi[i];
    for (j=0; j<anz; j++) {
      row = adj[j];
      pnz = pi_loc[row+1] - pi_loc[row];
      pj  = pj_loc + pi_loc[row];
      pa  = pa_loc + pi_loc[row];
      /* perform sparse axpy */
      valtmp = ada[j];
      nextp  = 0;
      for (k=0; nextp<pnz; k++) {
        if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
          apa_sparse[k] += valtmp*pa[nextp++];
        }
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* off-diagonal portion of A */
    anz = aoi[i+1] - aoi[i];
    aoj = ao->j + aoi[i];
    aoa = ao->a + aoi[i];
    for (j=0; j<anz; j++) {
      row = aoj[j];
      pnz = pi_oth[row+1] - pi_oth[row];
      pj  = pj_oth + pi_oth[row];
      pa  = pa_oth + pi_oth[row];
      /* perform sparse axpy */
      valtmp = aoa[j];
      nextp  = 0;
      for (k=0; nextp<pnz; k++) {
        if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
          apa_sparse[k] += valtmp*pa[nextp++];
        }
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* set values in C */
    cdnz = cd->i[i+1] - cd->i[i];
    conz = co->i[i+1] - co->i[i];

    /* 1st off-diagonal part of C */
    ca = coa + co->i[i];
    k  = 0;
    for (k0=0; k0<conz; k0++){
      if (apJ[k] >= cstart) break;
      ca[k0]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }

    /* diagonal part of C */
    ca = cda + cd->i[i];
    for (k1=0; k1<cdnz; k1++){
      ca[k1]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }

    /* 2nd off-diagonal part of C */
    ca = coa + co->i[i];
    for (; k0<conz; k0++){
      ca[k0]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
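
/*
  Unlike MatMatMultNumeric_MPIAIJ_MPIAIJ, which scatters into a dense
  accumulator of length N(P), this variant accumulates directly into the
  compressed row apa_sparse[0..apnz) by merging two sorted index streams, as
  in the axpy loops above:

    nextp = 0;
    for (k=0; nextp<pnz; k++) {
      if (apJ[k] == pj[nextp]) apa_sparse[k] += valtmp*pa[nextp++]; // advance when columns match
    }

  This trades the O(N) accumulator memory for extra index comparisons, hence
  "scalable but slower".
*/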

/* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ(), except using LLCondensed to avoid O(BN) memory requirement */
#undef __FUNCT__
#define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable"
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode       ierr;
  MPI_Comm             comm=((PetscObject)A)->comm;
  Mat                  Cmpi;
  Mat_PtAPMPI          *ptap;
  PetscFreeSpaceList   free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*c;
  Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
  PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
  PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
  PetscInt             i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max=0;
  PetscInt             am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
  PetscInt             nlnk_max,armax,prmax;
  PetscReal            afill;
  PetscScalar          *apa;

  PetscFunctionBegin;
  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr);

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j;
  pi_oth = p_oth->i; pj_oth = p_oth->j;

  /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
  /*-------------------------------------------------------------------*/
  ierr  = PetscMalloc((am+2)*sizeof(PetscInt),&api);CHKERRQ(ierr);
  ptap->api = api;
  api[0]    = 0;

  /* create and initialize a linked list */
  armax = ad->rmax+ao->rmax;
  prmax = PetscMax(p_loc->rmax,p_oth->rmax);
  nlnk_max = armax*prmax;
  if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
  ierr = PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
  ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    apnz = 0;
    /* diagonal portion of A */
    nzi = adi[i+1] - adi[i];
    for (j=0; j<nzi; j++){
      row = *adj++;
      pnz = pi_loc[row+1] - pi_loc[row];
      Jptr  = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }
    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    for (j=0; j<nzi; j++){
      row = *aoj++;
      pnz = pi_oth[row+1] - pi_oth[row];
      Jptr  = pj_oth + pi_oth[row];
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }

    apnz     = *lnk;
    api[i+1] = api[i] + apnz;
    if (apnz > apnz_max) apnz_max = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);
    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }

  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc((api[am]+1)*sizeof(PetscInt),&ptap->apj);CHKERRQ(ierr);
  apj  = ptap->apj;
  ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);

  /* create and assemble symbolic parallel matrix Cmpi */
  /*----------------------------------------------------*/
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,A->rmap->bs,P->cmap->bs);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);

  /* malloc apa for assembly Cmpi */
  ierr = PetscMalloc(apnz_max*sizeof(PetscScalar),&apa);CHKERRQ(ierr);
  ierr = PetscMemzero(apa,apnz_max*sizeof(PetscScalar));CHKERRQ(ierr);
  ptap->apa = apa;
  for (i=0; i<am; i++){
    row  = i + rstart;
    apnz = api[i+1] - api[i];
    ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
    apj += apnz;
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->destroy             = Cmpi->ops->destroy;
  ptap->duplicate           = Cmpi->ops->duplicate;
  Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable;
  Cmpi->ops->destroy        = MatDestroy_MPIAIJ_MatMatMult;
  Cmpi->ops->duplicate      = MatDuplicate_MPIAIJ_MatMatMult;

  /* attach the supporting struct to Cmpi for reuse */
  c = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap  = ptap;

  *C = Cmpi;

  /* set MatInfo */
  afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
  if (afill < 1.0) afill = 1.0;
  Cmpi->info.mallocs           = nspacedouble;
  Cmpi->info.fill_ratio_given  = fill;
  Cmpi->info.fill_ratio_needed = afill;

#if defined(PETSC_USE_INFO)
  if (api[am]) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}
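
/*
  Selecting this scalable path at run time (a sketch; the option is read in
  MatMatMultSymbolic_MPIAIJ_MPIAIJ above, and any options prefix set on A
  would apply):

    ./app -matmatmult_scalable
  or, from code, using the two-argument PetscOptionsSetValue() of this PETSc
  vintage:
    ierr = PetscOptionsSetValue("-matmatmult_scalable","true");CHKERRQ(ierr);
*/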

/*-------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMult_MPIAIJ_MPIAIJ"
PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  PetscBool      scalable=PETSC_FALSE;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX){
    ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
      ierr = PetscOptionsBool("-mattransposematmult_scalable","Use a scalable but slower C=Pt*A","",scalable,&scalable,PETSC_NULL);CHKERRQ(ierr);
      if (scalable){
        ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(P,A,fill,C);CHKERRQ(ierr);
      } else {
        ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);CHKERRQ(ierr);
      }
    ierr = PetscOptionsEnd();CHKERRQ(ierr);
  }
  ierr = (*(*C)->ops->mattransposemultnumeric)(P,A,*C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
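
/*
  Usage sketch for this entry point (an illustrative example; P and A are
  assumed to be MATMPIAIJ with identical row layouts, and the result is
  C = P^T * A):

    Mat C;
    ierr = MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
    ierr = MatTransposeMatMult(P,A,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
    ierr = MatDestroy(&C);CHKERRQ(ierr);
*/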

#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ"
PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
{
  PetscErrorCode       ierr;
  Mat_Merge_SeqsToMPI  *merge;
  Mat_MPIAIJ           *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ           *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
  Mat_PtAPMPI          *ptap;
  PetscInt             *adj,*aJ;
  PetscInt             i,j,k,anz,pnz,row,*cj;
  MatScalar            *ada,*aval,*ca,valtmp;
  PetscInt             am=A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
  MPI_Comm             comm=((PetscObject)C)->comm;
  PetscMPIInt          size,rank,taga,*len_s;
  PetscInt             *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
  PetscInt             **buf_ri,**buf_rj;
  PetscInt             cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
  MPI_Request          *s_waits,*r_waits;
  MPI_Status           *status;
  MatScalar            **abuf_r,*ba_i,*pA,*coa,*ba;
  PetscInt             *ai,*aj,*coi,*coj;
  PetscInt             *poJ=po->j,*pdJ=pd->j;
  Mat                  A_loc;
  Mat_SeqAIJ           *a_loc;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ptap  = c->ptap;
  merge = ptap->merge;

  /* 2) compute numeric C_seq = P_loc^T*A_loc - dominating part */
  /*------------------------------------------------------------*/
  /* get data from symbolic products */
  coi = merge->coi; coj = merge->coj;
  ierr = PetscMalloc((coi[pon]+1)*sizeof(MatScalar),&coa);CHKERRQ(ierr);
  ierr = PetscMemzero(coa,coi[pon]*sizeof(MatScalar));CHKERRQ(ierr);

  bi     = merge->bi; bj = merge->bj;
  owners = merge->rowmap->range;
  ierr   = PetscMalloc((bi[cm]+1)*sizeof(MatScalar),&ba);CHKERRQ(ierr);
  ierr   = PetscMemzero(ba,bi[cm]*sizeof(MatScalar));CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  A_loc = ptap->A_loc;
  ierr = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
  a_loc = (Mat_SeqAIJ*)(A_loc)->data;
  ai   = a_loc->i;
  aj   = a_loc->j;

  ierr = PetscMalloc((A->cmap->N)*sizeof(PetscScalar),&aval);CHKERRQ(ierr); /* non-scalable!!! */
  ierr = PetscMemzero(aval,A->cmap->N*sizeof(PetscScalar));CHKERRQ(ierr);

  for (i=0; i<am; i++) {
    /* 2-a) put A[i,:] to dense array aval */
    anz = ai[i+1] - ai[i];
    adj = aj + ai[i];
    ada = a_loc->a + ai[i];
    for (j=0; j<anz; j++){
      aval[adj[j]] = ada[j];
    }

    /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
    /*--------------------------------------------------------------*/
    /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
    pnz = po->i[i+1] - po->i[i];
    poJ = po->j + po->i[i];
    pA  = po->a + po->i[i];
    for (j=0; j<pnz; j++){
      row = poJ[j];
      cnz = coi[row+1] - coi[row];
      cj  = coj + coi[row];
      ca  = coa + coi[row];
      /* perform dense axpy */
      valtmp = pA[j];
      for (k=0; k<cnz; k++) {
        ca[k] += valtmp*aval[cj[k]];
      }
      ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
    }

    /* put the value into Cd (diagonal part) */
    pnz = pd->i[i+1] - pd->i[i];
    pdJ = pd->j + pd->i[i];
    pA  = pd->a + pd->i[i];
    for (j=0; j<pnz; j++){
      row = pdJ[j];
      cnz = bi[row+1] - bi[row];
      cj  = bj + bi[row];
      ca  = ba + bi[row];
      /* perform dense axpy */
      valtmp = pA[j];
      for (k=0; k<cnz; k++) {
        ca[k] += valtmp*aval[cj[k]];
      }
      ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
    }

    /* zero the current row of Pt*A */
    aJ = aj + ai[i];
    for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
  }

  /* 3) send and recv matrix values coa */
  /*------------------------------------*/
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;
  len_s  = merge->len_s;
  ierr = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
  ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc2(merge->nsend+1,MPI_Request,&s_waits,size,MPI_Status,&status);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    i = merge->owners_co[proc];
    ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }
  if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}

  ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);
  ierr = PetscFree(coa);CHKERRQ(ierr);

  /* 4) insert local Cseq and received values into Cmpi */
  /*----------------------------------------------------*/
  ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++){
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
    nrows       = *(buf_ri_k[k]);
    nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
  }

  for (i=0; i<cm; i++) {
    row = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj + bi[i];  /* col indices of the i-th row of C */
    ba_i = ba + bi[i];
    bnz  = bi[i+1] - bi[i];
    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++){ /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cnz = *(nextci[k]+1) - *nextci[k];
        cj  = buf_rj[k] + *(nextci[k]);
        ca  = abuf_r[k] + *(nextci[k]);
        nextcj = 0;
        for (j=0; nextcj<cnz; j++){
          if (bj_i[j] == cj[nextcj]){ /* bcol == ccol */
            ba_i[j] += ca[nextcj++];
          }
        }
        nextrow[k]++; nextci[k]++;
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }
    }
    ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ierr = PetscFree(ba);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscFree(aval);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
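
/*
  The numeric kernel above forms C = P^T*A as a sum of rank-1 outer products:
  for each local row i,

    C += P[i,:]^T * A[i,:],

  where contributions through P_diag land in locally owned rows (ba) and
  contributions through P_off land in rows owned by other processes (coa),
  which are shipped in step 3 and merged into C in step 4.
*/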

/* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ"
PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
{
  PetscErrorCode       ierr;
  Mat                  Cmpi,A_loc,POt,PDt;
  Mat_PtAPMPI          *ptap;
  PetscFreeSpaceList   free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_MPIAIJ           *p=(Mat_MPIAIJ*)P->data,*c;
  PetscInt             *pdti,*pdtj,*poti,*potj,*ptJ;
  PetscInt             nnz;
  PetscInt             *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
  PetscInt             am=A->rmap->n,pn=P->cmap->n;
  PetscBT              lnkbt;
  MPI_Comm             comm=((PetscObject)A)->comm;
  PetscMPIInt          size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
  PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt             len,proc,*dnz,*onz,*owners;
  PetscInt             nzi,*bi,*bj;
  PetscInt             nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request          *swaits,*rwaits;
  MPI_Status           *sstatus,rstatus;
  Mat_Merge_SeqsToMPI  *merge;
  PetscInt             *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
  PetscReal            afill=1.0,afill_tmp;
  PetscInt             rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
  PetscScalar          *vals;
  Mat_SeqAIJ           *a_loc, *pdt,*pot;

  PetscFunctionBegin;
  /* check if matrix local sizes are compatible */
  if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend){
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
  }

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
  ptap->A_loc = A_loc;
  a_loc = (Mat_SeqAIJ*)(A_loc)->data;
  ai   = a_loc->i;
  aj   = a_loc->j;

  /* determine symbolic Co=(p->B)^T*A - send to others */
  /*----------------------------------------------------*/
  ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
  pdt = (Mat_SeqAIJ*)PDt->data;
  pdti = pdt->i; pdtj = pdt->j;

  ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
  pot = (Mat_SeqAIJ*)POt->data;
  poti = pot->i; potj = pot->j;

  /* then, compute symbolic Co = (p->B)^T*A */
  pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors
                         >= (num of nonzero rows of C_seq) - pn */
  ierr = PetscMalloc((pon+1)*sizeof(PetscInt),&coi);CHKERRQ(ierr);
  coi[0] = 0;

  /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
  nnz           = (PetscInt)(fill*(poti[pon] + ai[am]));
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* create and initialize a linked list */
  i = PetscMax(pdt->rmax,pot->rmax);
  Crmax = i*a_loc->rmax*size;
  if (!Crmax || Crmax > aN) Crmax = aN;
  ierr = PetscLLCondensedCreate(Crmax,aN,&lnk,&lnkbt);CHKERRQ(ierr);

  for (i=0; i<pon; i++) {
    pnz = poti[i+1] - poti[i];
    ptJ = potj + poti[i];
    for (j=0; j<pnz; j++){
      row  = ptJ[j]; /* row of A_loc == col of Pot */
      anz  = ai[row+1] - ai[row];
      Jptr = aj + ai[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    nnz = lnk[0];

    /* If free space is not available, double the total space in the list */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, and zero out dense rows */
    ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;
    coi[i+1] = coi[i] + nnz;
  }

  ierr = PetscMalloc((coi[pon]+1)*sizeof(PetscInt),&coj);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;

  /* send j-array (coj) of Co to other processors */
  /*----------------------------------------------*/
  /* determine row ownership */
  ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
  ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
  merge->rowmap->n = pn;
  merge->rowmap->bs = 1;
  ierr = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
  owners = merge->rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
  ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
  len_s = merge->len_s;
  merge->nsend = 0;

  ierr = PetscMalloc((size+2)*sizeof(PetscInt),&owners_co);CHKERRQ(ierr);
  ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

  proc = 0;
  for (i=0; i<pon; i++){
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i];
  }

  len   = 0;  /* total length of buf_s[] */
  owners_co[0] = 0;
  for (proc=0; proc<size; proc++){
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_si[proc]){
      merge->nsend++;
      len_si[proc] = 2*(len_si[proc] + 1);
      len += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj */
  ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    i = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* receives and sends of coj are complete */
  ierr = PetscMalloc(size*sizeof(MPI_Status),&sstatus);CHKERRQ(ierr);
  for (i=0; i<merge->nrecv; i++){
    PetscMPIInt icompleted;
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* send and recv coi */
  /*-------------------*/
  ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows = len_si[proc]/2 - 1;
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++){
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
      buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
  i = merge->nrecv;
  while (i--) {
    PetscMPIInt icompleted;
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
  ierr = PetscFree(len_si);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(sstatus);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

  /* compute the local portion of C (mpi mat) */
  /*------------------------------------------*/
  /* allocate bi array and free space for accumulating nonzero column info */
  ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
  bi[0] = 0;

  /* set initial free space to be fill*(nnz(P) + nnz(A)) */
  nnz           = (PetscInt)(fill*(pdti[pn] + poti[pon] + ai[am]));
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++){
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
  }

  ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
  rmax = 0;
  for (i=0; i<pn; i++) {
    /* add pdt[i,:]*AP into lnk */
    pnz = pdti[i+1] - pdti[i];
    ptJ = pdtj + pdti[i];
    for (j=0; j<pnz; j++){
      row  = ptJ[j];  /* row of AP == col of Pt */
      anz  = ai[row+1] - ai[row];
      Jptr = aj + ai[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }

    /* add received col data into lnk */
    for (k=0; k<merge->nrecv; k++){ /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi = *(nextci[k]+1) - *nextci[k];
        Jptr  = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nnz = lnk[0];

    /* if free space is not available, make more free space */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }
    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;
    bi[i+1] = bi[i] + nnz;
    if (nnz > rmax) rmax = nnz;
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);

  ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;
  ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
  ierr = MatDestroy(&POt);CHKERRQ(ierr);
  ierr = MatDestroy(&PDt);CHKERRQ(ierr);

  /* create symbolic parallel matrix Cmpi - why can't it be assembled in the numeric part? */
  /*---------------------------------------------------------------------------------------*/
  ierr = PetscMalloc((rmax+1)*sizeof(PetscScalar),&vals);CHKERRQ(ierr);
  ierr = PetscMemzero(vals,rmax*sizeof(PetscScalar));CHKERRQ(ierr);

  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,P->cmap->bs,A->cmap->bs);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
  for (i=0; i<pn; i++){
    row = i + rstart;
    nnz = bi[i+1] - bi[i];
    Jptr = bj + bi[i];
    ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = PetscFree(vals);CHKERRQ(ierr);

  merge->bi            = bi;
  merge->bj            = bj;
  merge->coi           = coi;
  merge->coj           = coj;
  merge->buf_ri        = buf_ri;
  merge->buf_rj        = buf_rj;
  merge->owners_co     = owners_co;
  merge->destroy       = Cmpi->ops->destroy;
  merge->duplicate     = Cmpi->ops->duplicate;

  Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
  Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;

  /* attach the supporting struct to Cmpi for reuse */
  c = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap        = ptap;
  ptap->api      = PETSC_NULL;
  ptap->apj      = PETSC_NULL;
  ptap->merge    = merge;
  ptap->rmax     = rmax;

  *C = Cmpi;
#if defined(PETSC_USE_INFO)
  if (bi[pn] != 0) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}
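
/*
  The i-structure messages built above have the layout
  {nrows, row[0..nrows-1], i[0..nrows]}: for example, two rows with receiver-
  local indices 3 and 7 carrying 4 and 6 nonzeros are announced as
  {2, 3, 7, 0, 4, 10}, which is why len_si[proc] = 2*(nrows+1).  The
  Mat_Merge_SeqsToMPI struct caches these received structures (buf_ri,buf_rj)
  together with the local patterns (bi,bj,coi,coj) and the send map
  (owners_co,len_s), so the numeric phase can repeat the reduction without
  re-communicating symbolic data.
*/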
1322 
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable"
PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable(Mat P,Mat A,Mat C)
{
  PetscErrorCode       ierr;
  Mat_Merge_SeqsToMPI  *merge;
  Mat_MPIAIJ           *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ           *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
  Mat_PtAPMPI          *ptap;
  PetscInt             *adj;
  PetscInt             i,j,k,anz,pnz,row,*cj,nexta;
  MatScalar            *ada,*ca,valtmp;
  PetscInt             am=A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
  MPI_Comm             comm=((PetscObject)C)->comm;
  PetscMPIInt          size,rank,taga,*len_s;
  PetscInt             *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
  PetscInt             **buf_ri,**buf_rj;
  PetscInt             cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
  MPI_Request          *s_waits,*r_waits;
  MPI_Status           *status;
  MatScalar            **abuf_r,*ba_i,*pA,*coa,*ba;
  PetscInt             *ai,*aj,*coi,*coj;
  PetscInt             *poJ=po->j,*pdJ=pd->j;
  Mat                  A_loc;
  Mat_SeqAIJ           *a_loc;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ptap  = c->ptap;
  merge = ptap->merge;

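  /* Overview (summary of the steps below): the local product C_seq = P_loc^T*A_loc is
     accumulated in two pieces: rows owned by this process go into ba (pattern bi/bj),
     rows owned by other processes go into coa (pattern coi/coj). The coa values are then
     communicated, and the received contributions are merged into the owned rows before
     MatSetValues/MatAssembly builds the parallel C. */
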
  /* 2) compute numeric C_seq = P_loc^T*A_loc */
  /*------------------------------------------*/
  /* get data from symbolic products */
  coi = merge->coi; coj = merge->coj;
  ierr = PetscMalloc((coi[pon]+1)*sizeof(MatScalar),&coa);CHKERRQ(ierr);
  ierr = PetscMemzero(coa,coi[pon]*sizeof(MatScalar));CHKERRQ(ierr);
  bi     = merge->bi; bj = merge->bj;
  owners = merge->rowmap->range;
  ierr   = PetscMalloc((bi[cm]+1)*sizeof(MatScalar),&ba);CHKERRQ(ierr);
  ierr   = PetscMemzero(ba,bi[cm]*sizeof(MatScalar));CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  A_loc = ptap->A_loc;
  ierr = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
  a_loc = (Mat_SeqAIJ*)(A_loc)->data;
  ai   = a_loc->i;
  aj   = a_loc->j;

  for (i=0; i<am; i++) {
    /* 2-a) get the sparse row A[i,:] (kept sparse in this scalable variant) */
    anz = ai[i+1] - ai[i];
    adj = aj + ai[i];
    ada = a_loc->a + ai[i];

    /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
    /*-------------------------------------------------------------*/
    /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
    pnz = po->i[i+1] - po->i[i];
    poJ = po->j + po->i[i];
    pA  = po->a + po->i[i];
    for (j=0; j<pnz; j++){
      row = poJ[j];
      cnz = coi[row+1] - coi[row];
      cj  = coj + coi[row];
      ca  = coa + coi[row];
      /* perform sparse axpy */
      nexta  = 0;
      valtmp = pA[j];
      for (k=0; nexta<anz; k++) {
        if (cj[k] == adj[nexta]){
          ca[k] += valtmp*ada[nexta];
          nexta++;
        }
      }
      ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
    }

    /* put the value into Cd (diagonal part) */
    pnz = pd->i[i+1] - pd->i[i];
    pdJ = pd->j + pd->i[i];
    pA  = pd->a + pd->i[i];
    for (j=0; j<pnz; j++){
      row = pdJ[j];
      cnz = bi[row+1] - bi[row];
      cj  = bj + bi[row];
      ca  = ba + bi[row];
      /* perform sparse axpy */
      nexta  = 0;
      valtmp = pA[j];
      for (k=0; nexta<anz; k++) {
        if (cj[k] == adj[nexta]){
          ca[k] += valtmp*ada[nexta];
          nexta++;
        }
      }
      ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
    }

  }
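
  /* The sparse axpy above is a two-pointer merge: the symbolic phase built each row pattern
     cj of C as a superset of the corresponding row pattern adj of A_loc, so scanning cj with
     k while advancing nexta only on matches accumulates every entry of A[i,:]; e.g. with
     cj = {0,2,5,7} and adj = {2,7}, k visits 0, 2 (match), 5, 7 (match). */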

  /* 3) send and recv matrix values coa */
  /*------------------------------------*/
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;
  len_s  = merge->len_s;
  ierr = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
  ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc2(merge->nsend+1,MPI_Request,&s_waits,size,MPI_Status,&status);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    i = merge->owners_co[proc];
    ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }
  if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}

  ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);
  ierr = PetscFree(coa);CHKERRQ(ierr);

  /* 4) insert local Cseq and received values into Cmpi */
  /*----------------------------------------------------*/
1450   ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr);
1451   for (k=0; k<merge->nrecv; k++){
1452     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1453     nrows       = *(buf_ri_k[k]);
1454     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th recved i-structure */
1455     nextci[k]   = buf_ri_k[k] + (nrows + 1);/* poins to the next i-structure of k-th recved i-structure  */
1456   }
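
  /* nextrow[k] and nextci[k] act as cursors into the k-th received i-structure: when the
     local row i equals *nextrow[k], entries *nextci[k] .. *(nextci[k]+1)-1 of buf_rj[k] and
     abuf_r[k] belong to that row, and both cursors then advance by one. */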

  for (i=0; i<cm; i++) {
    row = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj + bi[i];  /* col indices of the i-th row of C */
    ba_i = ba + bi[i];
    bnz  = bi[i+1] - bi[i];
    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++){ /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cnz = *(nextci[k]+1) - *nextci[k];
        cj  = buf_rj[k] + *(nextci[k]);
        ca  = abuf_r[k] + *(nextci[k]);
        nextcj = 0;
        for (j=0; nextcj<cnz; j++){
          if (bj_i[j] == cj[nextcj]){ /* bcol == ccol */
            ba_i[j] += ca[nextcj++];
          }
        }
        nextrow[k]++; nextci[k]++;
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }
    }
    ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ierr = PetscFree(ba);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable"
PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(Mat P,Mat A,PetscReal fill,Mat *C)
{
  PetscErrorCode       ierr;
  Mat                  Cmpi,A_loc,POt,PDt;
  Mat_PtAPMPI          *ptap;
  PetscFreeSpaceList   free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_MPIAIJ           *p=(Mat_MPIAIJ*)P->data,*c;
  PetscInt             *pdti,*pdtj,*poti,*potj,*ptJ;
  PetscInt             nnz;
  PetscInt             *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
  PetscInt             am=A->rmap->n,pn=P->cmap->n;
  MPI_Comm             comm=((PetscObject)A)->comm;
  PetscMPIInt          size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
  PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt             len,proc,*dnz,*onz,*owners;
  PetscInt             nzi,*bi,*bj;
  PetscInt             nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request          *swaits,*rwaits;
  MPI_Status           *sstatus,rstatus;
  Mat_Merge_SeqsToMPI  *merge;
  PetscInt             *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
  PetscReal            afill=1.0,afill_tmp;
  PetscInt             rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
  PetscScalar          *vals;
  Mat_SeqAIJ           *a_loc, *pdt,*pot;

  PetscFunctionBegin;
  /* check if matrix local sizes are compatible */
  if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend){
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
  }

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI; it will be attached to C later */
  ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
  ptap->A_loc = A_loc;
  a_loc = (Mat_SeqAIJ*)(A_loc)->data;
  ai   = a_loc->i;
  aj   = a_loc->j;

  /* determine symbolic Co=(p->B)^T*A - send to others */
  /*----------------------------------------------------*/
  ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
  pdt = (Mat_SeqAIJ*)PDt->data;
  pdti = pdt->i; pdtj = pdt->j;

  ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
  pot = (Mat_SeqAIJ*)POt->data;
  poti = pot->i; potj = pot->j;

  /* then, compute symbolic Co = (p->B)^T*A */
  pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors
                         >= (num of nonzero rows of C_seq) - pn */
  ierr = PetscMalloc((pon+1)*sizeof(PetscInt),&coi);CHKERRQ(ierr);
  coi[0] = 0;

  /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
  nnz           = fill*(poti[pon] + ai[am]);
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* create and initialize a linked list */
  i = PetscMax(pdt->rmax,pot->rmax);
  Crmax = i*a_loc->rmax*size; /* non-scalable! */
  if (!Crmax || Crmax > aN) Crmax = aN;
  ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
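  /* Crmax estimates the longest row of C as (max row nnz of P^T)*(max row nnz of A_loc)*size,
     which badly overestimates on many processes (hence "non-scalable!"), so it is capped by
     aN = A->cmap->N, the global column count of A, which bounds any row length of C. */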

  for (i=0; i<pon; i++) {
    nnz = 0;
    pnz = poti[i+1] - poti[i];
    ptJ = potj + poti[i];
    for (j=0; j<pnz; j++){
      row  = ptJ[j]; /* row of A_loc == col of Pot */
      anz  = ai[row+1] - ai[row];
      Jptr = aj + ai[row];
      /* add nonzero cols of A_loc[row,:] into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
    }
    nnz = lnk[0];

    /* If free space is not available, double the total space in the list */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* copy data into free space, then reinitialize lnk */
    ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;
    coi[i+1] = coi[i] + nnz;
  }
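
  /* PetscFreeSpaceGet() links a new chunk of length nnz+total_array_size onto the list, so
     the total capacity at least doubles on each realloc (counted by nspacedouble); e.g. an
     initial chunk of 1000 entries grows to >= 2000, then >= 4000, keeping the number of
     reallocs logarithmic in the final size. */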

  ierr = PetscMalloc((coi[pon]+1)*sizeof(PetscInt),&coj);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;

  /* send j-array (coj) of Co to other processors */
  /*----------------------------------------------*/
  /* determine row ownership */
  ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
  ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
  merge->rowmap->n = pn;
  merge->rowmap->bs = 1;
  ierr = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
  owners = merge->rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
  ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
  len_s = merge->len_s;
  merge->nsend = 0;

  ierr = PetscMalloc((size+2)*sizeof(PetscInt),&owners_co);CHKERRQ(ierr);
  ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

  proc = 0;
  for (i=0; i<pon; i++){
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i];
  }

  len   = 0;  /* total length of buf_s[], which holds all outgoing buf_si messages */
  owners_co[0] = 0;
  for (proc=0; proc<size; proc++){
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_si[proc]){
      merge->nsend++;
      len_si[proc] = 2*(len_si[proc] + 1);
      len += len_si[proc];
    }
  }
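
  /* At this point len_si[proc] = 2*(nrows+1): each i-structure message carries one count,
     nrows local row indices, and nrows+1 CSR-style offsets (see the packing loop below). */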

  /* determine the number and length of messages to receive for coi and coj  */
  ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    i = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* receives and sends of coj are complete */
  ierr = PetscMalloc(size*sizeof(MPI_Status),&sstatus);CHKERRQ(ierr);
  for (i=0; i<merge->nrecv; i++){
    PetscMPIInt icompleted;
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* send and recv coi */
  /*-------------------*/
  ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (local on the destination process)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows = len_si[proc]/2 - 1;
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++){
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
      buf_si[nrows+1] = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
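
  /* Example of one packed i-structure message with nrows = 2, local rows {3,7} carrying
     4 and 2 nonzeros respectively:
         buf_si = {2, 3, 7, 0, 4, 6}
     i.e. the count, the two row indices, then the CSR-style offsets {0,4,6} that index into
     the matching coj (and later coa) message. */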
  i = merge->nrecv;
  while (i--) {
    PetscMPIInt icompleted;
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
  ierr = PetscFree(len_si);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(sstatus);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

  /* compute the local portion of C (mpi mat) */
  /*------------------------------------------*/
  /* allocate bi array and free space for accumulating nonzero column info */
  ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
  bi[0] = 0;

  /* set initial free space to be fill*(nnz(P) + nnz(A_loc)) */
  nnz           = fill*(pdti[pn] + poti[pon] + ai[am]);
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++){
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1);/* points to the next i-structure of k-th received i-structure */
  }

  ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
  rmax = 0;
  for (i=0; i<pn; i++) {
    /* add the nonzero structure of Pdt[i,:]*A_loc into lnk */
    pnz = pdti[i+1] - pdti[i];
    ptJ = pdtj + pdti[i];
    for (j=0; j<pnz; j++){
      row  = ptJ[j];  /* row of A_loc == col of Pdt */
      anz  = ai[row+1] - ai[row];
      Jptr = aj + ai[row];
      /* add nonzero cols of A_loc[row,:] into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
    }

    /* add received col data into lnk */
    for (k=0; k<merge->nrecv; k++){ /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nnz = lnk[0];

    /* if free space is not available, make more free space */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }
    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;
    bi[i+1] = bi[i] + nnz;
    if (nnz > rmax) rmax = nnz;
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);

  ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
  ierr = MatDestroy(&POt);CHKERRQ(ierr);
  ierr = MatDestroy(&PDt);CHKERRQ(ierr);

  /* create symbolic parallel matrix Cmpi - it cannot be assembled in the Numeric part */
  /*------------------------------------------------------------------------------------*/
  ierr = PetscMalloc((rmax+1)*sizeof(PetscScalar),&vals);CHKERRQ(ierr);
  ierr = PetscMemzero(vals,rmax*sizeof(PetscScalar));CHKERRQ(ierr);

  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,P->cmap->bs,A->cmap->bs);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
  for (i=0; i<pn; i++){
    row = i + rstart;
    nnz = bi[i+1] - bi[i];
    Jptr = bj + bi[i];
    ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = PetscFree(vals);CHKERRQ(ierr);

  merge->bi            = bi;
  merge->bj            = bj;
  merge->coi           = coi;
  merge->coj           = coj;
  merge->buf_ri        = buf_ri;
  merge->buf_rj        = buf_rj;
  merge->owners_co     = owners_co;
  merge->destroy       = Cmpi->ops->destroy;
  merge->duplicate     = Cmpi->ops->duplicate;

  Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable;
  Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;

  /* attach the supporting struct to Cmpi for reuse */
  c = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap        = ptap;
  ptap->api      = PETSC_NULL;
  ptap->apj      = PETSC_NULL;
  ptap->merge    = merge;
  ptap->rmax     = rmax;
  ptap->apa      = PETSC_NULL;

  *C = Cmpi;
#if defined(PETSC_USE_INFO)
  if (bi[pn] != 0) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}