/*
  Defines matrix-matrix product routines for pairs of MPIAIJ matrices
          C = A * B
*/
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/utils/freespace.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <../src/mat/impls/dense/mpi/mpidense.h>
#include <petsc/private/vecimpl.h>

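/*
  A minimal usage sketch (an illustration, not part of this file; error
  checking with CHKERRQ() is assumed, and A, B are assembled MPIAIJ
  matrices):

    Mat C;
    ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
    ... change numerical values of A and/or B, keeping the nonzero pattern ...
    ierr = MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
    ierr = MatDestroy(&C);CHKERRQ(ierr);

  For MPIAIJ*MPIAIJ the algorithm can be chosen at runtime with
  -matmatmult_via <scalable,nonscalable>; see MatMatMult_MPIAIJ_MPIAIJ() below.
*/
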
#undef __FUNCT__
#define __FUNCT__ "MatMatMult_MPIAIJ_MPIAIJ"
PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  const char     *algTypes[2] = {"scalable","nonscalable"};
  PetscInt       alg=1; /* set default algorithm */
  MPI_Comm       comm;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
    if (A->cmap->rstart != B->rmap->rstart || A->cmap->rend != B->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D,%D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,B->rmap->rstart,B->rmap->rend);

    ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
    ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,2,algTypes[1],&alg,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsEnd();CHKERRQ(ierr);

    ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
    switch (alg) {
    case 1:
      ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A,B,fill,C);CHKERRQ(ierr);
      break;
    default:
      ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);CHKERRQ(ierr);
      break;
    }
    ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

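/*
  With MAT_REUSE_MATRIX the symbolic phase above is skipped entirely: the
  numeric product is recomputed through the matmultnumeric function pointer
  that the chosen symbolic routine stored on C.
*/
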
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJ_MatMatMult"
PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI    *ptap = a->ptap;

  PetscFunctionBegin;
  ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
  ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);
  ierr = PetscFree(ptap->api);CHKERRQ(ierr);
  ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
  ierr = PetscFree(ptap->apa);CHKERRQ(ierr);
  ierr = ptap->destroy(A);CHKERRQ(ierr);
  ierr = PetscFree(ptap);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDuplicate_MPIAIJ_MatMatMult"
PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI    *ptap = a->ptap;

  PetscFunctionBegin;
  ierr = (*ptap->duplicate)(A,op,M);CHKERRQ(ierr);

  (*M)->ops->destroy   = ptap->destroy;   /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
  (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable"
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a  =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ     *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
  PetscScalar    *cda=cd->a,*coa=co->a;
  Mat_SeqAIJ     *p_loc,*p_oth;
  PetscScalar    *apa,*ca;
  PetscInt       cm   =C->rmap->n;
  Mat_PtAPMPI    *ptap=c->ptap;
  PetscInt       *api,*apj,*apJ,i,k;
  PetscInt       cstart=C->cmap->rstart;
  PetscInt       cdnz,conz,k0,k1;
  MPI_Comm       comm;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  /* update numerical values of P_oth and P_loc */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
  /*----------------------------------------------------------*/
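  /* For each local row i, the dense work array apa accumulates
     C[i,:] = Ad[i,:]*P_loc + Ao[i,:]*P_oth; the nonzeros are then copied
     into the diagonal (cd) and off-diagonal (co) blocks of C. */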
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = NULL;
  if (size > 1) {
    p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  }

  /* get apa for storing dense row A[i,:]*P */
  apa = ptap->apa;

  api = ptap->api;
  apj = ptap->apj;
  for (i=0; i<cm; i++) {
    /* compute apa = A[i,:]*P */
    AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);

    /* set values in C */
    apJ  = apj + api[i];
    cdnz = cd->i[i+1] - cd->i[i];
    conz = co->i[i+1] - co->i[i];

    /* 1st off-diagonal part of C */
    ca = coa + co->i[i];
    k  = 0;
    for (k0=0; k0<conz; k0++) {
      if (apJ[k] >= cstart) break;
      ca[k0]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }

    /* diagonal part of C */
    ca = cda + cd->i[i];
    for (k1=0; k1<cdnz; k1++) {
      ca[k1]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }

    /* 2nd off-diagonal part of C */
    ca = coa + co->i[i];
    for (; k0<conz; k0++) {
      ca[k0]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable"
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode     ierr;
  MPI_Comm           comm;
  PetscMPIInt        size;
  Mat                Cmpi;
  Mat_PtAPMPI        *ptap;
  PetscFreeSpaceList free_space=NULL,current_space=NULL;
  Mat_MPIAIJ         *a        =(Mat_MPIAIJ*)A->data,*c;
  Mat_SeqAIJ         *ad       =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
  PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
  PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
  PetscInt           *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
  PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n,Crmax;
  PetscBT            lnkbt;
  PetscScalar        *apa;
  PetscReal          afill;
  PetscTable         ta;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j;
  if (size > 1) {
    p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
    pi_oth = p_oth->i; pj_oth = p_oth->j;
  } else {
    p_oth  = NULL;
    pi_oth = NULL; pj_oth = NULL;
  }

  /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
  /*-------------------------------------------------------------------*/
  ierr      = PetscMalloc1(am+2,&api);CHKERRQ(ierr);
  ptap->api = api;
  api[0]    = 0;

  /* create and initialize a linked list */
  Crmax = 6*(p_loc->rmax + (PetscInt)(1.e-2*pN)); /* expected Crmax */
  if (Crmax > pN) Crmax = pN;
  ierr = PetscTableCreate(Crmax,pN,&ta);CHKERRQ(ierr);
  MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
  MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
  ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);

  ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
  ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    /* diagonal portion of A */
    nzi = adi[i+1] - adi[i];
    for (j=0; j<nzi; j++) {
      row  = *adj++;
      pnz  = pi_loc[row+1] - pi_loc[row];
      Jptr = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    for (j=0; j<nzi; j++) {
      row  = *aoj++;
      pnz  = pi_oth[row+1] - pi_oth[row];
      Jptr = pj_oth + pi_oth[row];
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }

    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }

  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc1(api[am]+1,&ptap->apj);CHKERRQ(ierr);
  apj  = ptap->apj;
  ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* malloc apa to store dense row A[i,:]*P */
  ierr = PetscCalloc1(pN,&apa);CHKERRQ(ierr);

  ptap->apa = apa;

  /* create and assemble symbolic parallel matrix Cmpi */
  /*----------------------------------------------------*/
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);

  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    row  = i + rstart;
    apnz = api[i+1] - api[i];
    ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
    apj += apnz;
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->destroy             = Cmpi->ops->destroy;
  ptap->duplicate           = Cmpi->ops->duplicate;
  Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
  Cmpi->ops->destroy        = MatDestroy_MPIAIJ_MatMatMult;
  Cmpi->ops->duplicate      = MatDuplicate_MPIAIJ_MatMatMult;

  /* attach the supporting struct to Cmpi for reuse */
  c       = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap = ptap;

  *C = Cmpi;

  /* set MatInfo */
  afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
  if (afill < 1.0) afill = 1.0;
  Cmpi->info.mallocs           = nspacedouble;
  Cmpi->info.fill_ratio_given  = fill;
  Cmpi->info.fill_ratio_needed = afill;

#if defined(PETSC_USE_INFO)
  if (api[am]) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMult_MPIAIJ_MPIDense"
PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
    ierr = MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  ierr = MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

typedef struct {
  Mat         workB;
  PetscScalar *rvalues,*svalues;
  MPI_Request *rwaits,*swaits;
} MPIAIJ_MPIDense;
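
/*
  The MPIAIJ_MPIDense context is attached to the product matrix C as a
  PetscContainer composed under the key "workB" (see
  MatMatMultSymbolic_MPIAIJ_MPIDense() below). A sketch of how it is
  recovered later:

    PetscContainer  container;
    MPIAIJ_MPIDense *contents;
    ierr = PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);CHKERRQ(ierr);
    ierr = PetscContainerGetPointer(container,(void**)&contents);CHKERRQ(ierr);
*/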

#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJ_MPIDenseDestroy"
PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
{
  MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatDestroy(&contents->workB);CHKERRQ(ierr);
  ierr = PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);CHKERRQ(ierr);
  ierr = PetscFree(contents);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMultNumeric_MPIDense"
/*
    This is a "dummy" function that handles the case where matrix C was created as a dense matrix
  directly by the user and passed to MatMatMult() with the MAT_REUSE_MATRIX option.

  It is the same as MatMatMultSymbolic_MPIAIJ_MPIDense() except that it does not create C.
*/
PetscErrorCode MatMatMultNumeric_MPIDense(Mat A,Mat B,Mat C)
{
  PetscErrorCode         ierr;
  PetscBool              flg;
  Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
  PetscInt               nz   = aij->B->cmap->n;
  PetscContainer         container;
  MPIAIJ_MPIDense        *contents;
  VecScatter             ctx   = aij->Mvctx;
  VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
  VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)B,MATMPIDENSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Second matrix must be mpidense");

  /* Handle the case where the user provided the final C matrix rather than calling MatMatMult() with MAT_INITIAL_MATRIX */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATMPIAIJ,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"First matrix must be MPIAIJ");

  C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;

  ierr = PetscNew(&contents);CHKERRQ(ierr);
  /* Create work matrix used to store off-processor rows of B needed for local product */
  ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
  /* Create work arrays needed */
  ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
                      B->cmap->N*to->starts[to->n],&contents->svalues,
                      from->n,&contents->rwaits,
                      to->n,&contents->swaits);CHKERRQ(ierr);

  ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr);
  ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
  ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
  ierr = PetscObjectCompose((PetscObject)C,"workB",(PetscObject)container);CHKERRQ(ierr);
  ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);

  ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIDense"
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
{
  PetscErrorCode         ierr;
  Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
  PetscInt               nz   = aij->B->cmap->n;
  PetscContainer         container;
  MPIAIJ_MPIDense        *contents;
  VecScatter             ctx   = aij->Mvctx;
  VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
  VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
  PetscInt               m     = A->rmap->n,n=B->cmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)B),C);CHKERRQ(ierr);
  ierr = MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);CHKERRQ(ierr);
  ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
  ierr = MatSetType(*C,MATMPIDENSE);CHKERRQ(ierr);
  ierr = MatMPIDenseSetPreallocation(*C,NULL);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;

  ierr = PetscNew(&contents);CHKERRQ(ierr);
  /* Create work matrix used to store off-processor rows of B needed for local product */
  ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
  /* Create work arrays needed */
  ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
                      B->cmap->N*to->starts[to->n],&contents->svalues,
                      from->n,&contents->rwaits,
                      to->n,&contents->swaits);CHKERRQ(ierr);

  ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr);
  ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
  ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
  ierr = PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);CHKERRQ(ierr);
  ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMPIDenseScatter"
/*
    Performs an efficient scatter of the rows of B needed by this process; this is
    a modification of the VecScatterBegin_() routines.
*/
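/*
  Note: PETSc dense matrices are stored column-major (LAPACK convention), so
  entry (r,k) of B lives at b[r + nrowsB*k]; the pack/unpack loops below
  index accordingly.
*/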
PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
{
  Mat_MPIAIJ             *aij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode         ierr;
  PetscScalar            *b,*w,*svalues,*rvalues;
  VecScatter             ctx   = aij->Mvctx;
  VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
  VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
  PetscInt               i,j,k;
  PetscInt               *sindices,*sstarts,*rindices,*rstarts;
  PetscMPIInt            *sprocs,*rprocs,nrecvs;
  MPI_Request            *swaits,*rwaits;
  MPI_Comm               comm;
  PetscMPIInt            tag  = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
  MPI_Status             status;
  MPIAIJ_MPIDense        *contents;
  PetscContainer         container;
  Mat                    workB;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);CHKERRQ(ierr);
  if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
  ierr = PetscContainerGetPointer(container,(void**)&contents);CHKERRQ(ierr);

  workB = *outworkB = contents->workB;
  if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",workB->rmap->n,nrows);
  sindices = to->indices;
  sstarts  = to->starts;
  sprocs   = to->procs;
  swaits   = contents->swaits;
  svalues  = contents->svalues;

  rindices = from->indices;
  rstarts  = from->starts;
  rprocs   = from->procs;
  rwaits   = contents->rwaits;
  rvalues  = contents->rvalues;

  ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr);
  ierr = MatDenseGetArray(workB,&w);CHKERRQ(ierr);

  for (i=0; i<from->n; i++) {
    ierr = MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);CHKERRQ(ierr);
  }

  for (i=0; i<to->n; i++) {
    /* pack a message at a time */
    for (j=0; j<sstarts[i+1]-sstarts[i]; j++) {
      for (k=0; k<ncols; k++) {
        svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
      }
    }
    ierr = MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);CHKERRQ(ierr);
  }

  nrecvs = from->n;
  while (nrecvs) {
    ierr = MPI_Waitany(from->n,rwaits,&imdex,&status);CHKERRQ(ierr);
    nrecvs--;
    /* unpack a message at a time */
    for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++) {
      for (k=0; k<ncols; k++) {
        w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
      }
    }
  }
  if (to->n) {ierr = MPI_Waitall(to->n,swaits,to->sstatus);CHKERRQ(ierr);}

  ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr);
  ierr = MatDenseRestoreArray(workB,&w);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);

#undef __FUNCT__
#define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIDense"
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij    = (Mat_MPIAIJ*)A->data;
  Mat_MPIDense   *bdense = (Mat_MPIDense*)B->data;
  Mat_MPIDense   *cdense = (Mat_MPIDense*)C->data;
  Mat            workB;

  PetscFunctionBegin;
  /* diagonal block of A times all local rows of B */
  ierr = MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);CHKERRQ(ierr);

  /* get off-processor parts of B needed to complete the product */
  ierr = MatMPIDenseScatter(A,B,C,&workB);CHKERRQ(ierr);

  /* off-diagonal block of A times nonlocal rows of B */
  ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ"
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a   = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *ad  = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ     *cd  = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
  PetscInt       *adi = ad->i,*adj,*aoi=ao->i,*aoj;
  PetscScalar    *ada,*aoa,*cda=cd->a,*coa=co->a;
  Mat_SeqAIJ     *p_loc,*p_oth;
  PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
  PetscScalar    *pa_loc,*pa_oth,*pa,valtmp,*ca;
  PetscInt       cm          = C->rmap->n,anz,pnz;
  Mat_PtAPMPI    *ptap       = c->ptap;
  PetscScalar    *apa_sparse = ptap->apa;
  PetscInt       *api,*apj,*apJ,i,j,k,row;
  PetscInt       cstart = C->cmap->rstart;
  PetscInt       cdnz,conz,k0,k1,nextp;
  MPI_Comm       comm;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  /* update numerical values of P_oth and P_loc */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
  /*----------------------------------------------------------*/
  /* get data from symbolic products */
  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j; pa_loc = p_loc->a;
  if (size > 1) {
    p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
    pi_oth = p_oth->i; pj_oth = p_oth->j; pa_oth = p_oth->a;
  } else {
    p_oth = NULL; pi_oth = NULL; pj_oth = NULL; pa_oth = NULL;
  }

  api = ptap->api;
  apj = ptap->apj;
  for (i=0; i<cm; i++) {
    apJ = apj + api[i];

    /* diagonal portion of A */
    anz = adi[i+1] - adi[i];
    adj = ad->j + adi[i];
    ada = ad->a + adi[i];
    for (j=0; j<anz; j++) {
      row = adj[j];
      pnz = pi_loc[row+1] - pi_loc[row];
      pj  = pj_loc + pi_loc[row];
      pa  = pa_loc + pi_loc[row];
      /* perform sparse axpy */
      valtmp = ada[j];
      nextp  = 0;
      for (k=0; nextp<pnz; k++) {
        if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
          apa_sparse[k] += valtmp*pa[nextp++];
        }
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* off-diagonal portion of A */
    anz = aoi[i+1] - aoi[i];
    aoj = ao->j + aoi[i];
    aoa = ao->a + aoi[i];
    for (j=0; j<anz; j++) {
      row = aoj[j];
      pnz = pi_oth[row+1] - pi_oth[row];
      pj  = pj_oth + pi_oth[row];
      pa  = pa_oth + pi_oth[row];
      /* perform sparse axpy */
      valtmp = aoa[j];
      nextp  = 0;
      for (k=0; nextp<pnz; k++) {
        if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
          apa_sparse[k] += valtmp*pa[nextp++];
        }
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* set values in C */
    cdnz = cd->i[i+1] - cd->i[i];
    conz = co->i[i+1] - co->i[i];

    /* 1st off-diagonal part of C */
    ca = coa + co->i[i];
    k  = 0;
    for (k0=0; k0<conz; k0++) {
      if (apJ[k] >= cstart) break;
      ca[k0]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }

    /* diagonal part of C */
    ca = cda + cd->i[i];
    for (k1=0; k1<cdnz; k1++) {
      ca[k1]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }

    /* 2nd off-diagonal part of C */
    ca = coa + co->i[i];
    for (; k0<conz; k0++) {
      ca[k0]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Same as MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(), except it uses a condensed linked list to avoid the O(BN) memory requirement */
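/*
  This scalable variant is selected with -matmatmult_via scalable. Instead of
  a dense accumulator of length P->cmap->N, its numeric phase uses a sparse
  accumulator apa sized by the largest row of AP (apnz_max below), merged
  against the symbolic column pattern apj.
*/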
#undef __FUNCT__
#define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ"
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode     ierr;
  MPI_Comm           comm;
  PetscMPIInt        size;
  Mat                Cmpi;
  Mat_PtAPMPI        *ptap;
  PetscFreeSpaceList free_space = NULL,current_space=NULL;
  Mat_MPIAIJ         *a         = (Mat_MPIAIJ*)A->data,*c;
  Mat_SeqAIJ         *ad        = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
  PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
  PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
  PetscInt           i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max=0;
  PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n,Crmax;
  PetscReal          afill;
  PetscScalar        *apa;
  PetscTable         ta;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j;
  if (size > 1) {
    p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
    pi_oth = p_oth->i; pj_oth = p_oth->j;
  } else {
    p_oth  = NULL;
    pi_oth = NULL; pj_oth = NULL;
  }

  /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
  /*-------------------------------------------------------------------*/
  ierr      = PetscMalloc1(am+2,&api);CHKERRQ(ierr);
  ptap->api = api;
  api[0]    = 0;

  /* create and initialize a linked list */
  Crmax = 6*(p_loc->rmax + (PetscInt)(1.e-2*pN)); /* expected Crmax */
  if (Crmax > pN) Crmax = pN;
  ierr = PetscTableCreate(Crmax,pN,&ta);CHKERRQ(ierr);
  MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
  MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
  ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);

  ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
  ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);

  current_space = free_space;

  ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    /* diagonal portion of A */
    nzi = adi[i+1] - adi[i];
    for (j=0; j<nzi; j++) {
      row  = *adj++;
      pnz  = pi_loc[row+1] - pi_loc[row];
      Jptr = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }
    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    for (j=0; j<nzi; j++) {
      row  = *aoj++;
      pnz  = pi_oth[row+1] - pi_oth[row];
      Jptr = pj_oth + pi_oth[row];
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }

    apnz     = *lnk;
    api[i+1] = api[i] + apnz;
    if (apnz > apnz_max) apnz_max = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }

  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc1(api[am]+1,&ptap->apj);CHKERRQ(ierr);
  apj  = ptap->apj;
  ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);

  /* create and assemble symbolic parallel matrix Cmpi */
  /*----------------------------------------------------*/
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* malloc apa for assembling Cmpi */
  ierr = PetscCalloc1(apnz_max,&apa);CHKERRQ(ierr);

  ptap->apa = apa;
  for (i=0; i<am; i++) {
    row  = i + rstart;
    apnz = api[i+1] - api[i];
    ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
    apj += apnz;
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->destroy             = Cmpi->ops->destroy;
  ptap->duplicate           = Cmpi->ops->duplicate;
  Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ;
  Cmpi->ops->destroy        = MatDestroy_MPIAIJ_MatMatMult;
  Cmpi->ops->duplicate      = MatDuplicate_MPIAIJ_MatMatMult;

  /* attach the supporting struct to Cmpi for reuse */
  c       = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap = ptap;

  *C = Cmpi;

  /* set MatInfo */
  afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
  if (afill < 1.0) afill = 1.0;
  Cmpi->info.mallocs           = nspacedouble;
  Cmpi->info.fill_ratio_given  = fill;
  Cmpi->info.fill_ratio_needed = afill;

#if defined(PETSC_USE_INFO)
  if (api[am]) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}

/*-------------------------------------------------------------------------*/
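/*
  The routines below compute C = P^T*A. The algorithm is chosen at runtime
  with -mattransposematmult_via <scalable,nonscalable,matmatmult>; the
  "matmatmult" variant forms Pt = P^T explicitly with MatTranspose() and
  then calls MatMatMult(Pt,A,...).
*/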
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMult_MPIAIJ_MPIAIJ"
PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  const char     *algTypes[3] = {"scalable","nonscalable","matmatmult"};
  PetscInt       alg=0; /* set default algorithm */

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
    ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,3,algTypes[0],&alg,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsEnd();CHKERRQ(ierr);

    ierr = PetscLogEventBegin(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
    switch (alg) {
    case 1:
      ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(P,A,fill,C);CHKERRQ(ierr);
      break;
    case 2:
    {
      Mat         Pt;
      Mat_PtAPMPI *ptap;
      Mat_MPIAIJ  *c;
      ierr = MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);CHKERRQ(ierr);
      ierr = MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr);
      c        = (Mat_MPIAIJ*)(*C)->data;
      ptap     = c->ptap;
      ptap->Pt = Pt;
      (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult;
      PetscFunctionReturn(0);
    }
      break;
    default:
      ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);CHKERRQ(ierr);
      break;
    }
    ierr = PetscLogEventEnd(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
  ierr = (*(*C)->ops->mattransposemultnumeric)(P,A,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

901 
902 /* This routine only works when scall=MAT_REUSE_MATRIX! */
903 #undef __FUNCT__
904 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult"
905 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult(Mat P,Mat A,Mat C)
906 {
907   PetscErrorCode ierr;
908   Mat_MPIAIJ     *c=(Mat_MPIAIJ*)C->data;
909   Mat_PtAPMPI    *ptap= c->ptap;
910   Mat            Pt=ptap->Pt;
911 
912   PetscFunctionBegin;
913   ierr = MatTranspose(P,MAT_REUSE_MATRIX,&Pt);CHKERRQ(ierr);
914   ierr = MatMatMultNumeric(Pt,A,C);CHKERRQ(ierr);
915   PetscFunctionReturn(0);
916 }

/* Non-scalable version: uses a dense axpy */
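/*
  C = P^T*A is accumulated as a sum of outer products over the local rows of
  P: row i contributes P[i,:]^T * A[i,:]. Contributions that land in rows of
  C owned by other processes are accumulated in Co (through p->B) and
  communicated in step 3; locally owned contributions go directly into ba
  (through p->A) and are merged with the received values in step 4.
*/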
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable"
PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,Mat C)
{
  PetscErrorCode      ierr;
  Mat_Merge_SeqsToMPI *merge;
  Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
  Mat_PtAPMPI         *ptap;
  PetscInt            *adj,*aJ;
  PetscInt            i,j,k,anz,pnz,row,*cj;
  MatScalar           *ada,*aval,*ca,valtmp;
  PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
  MPI_Comm            comm;
  PetscMPIInt         size,rank,taga,*len_s;
  PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
  PetscInt            **buf_ri,**buf_rj;
  PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
  MPI_Request         *s_waits,*r_waits;
  MPI_Status          *status;
  MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
  PetscInt            *ai,*aj,*coi,*coj;
  PetscInt            *poJ,*pdJ;
  Mat                 A_loc;
  Mat_SeqAIJ          *a_loc;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ptap  = c->ptap;
  merge = ptap->merge;

  /* 2) compute numeric C_seq = P_loc^T*A_loc - dominating part */
  /*--------------------------------------------------------------*/
  /* get data from symbolic products */
  coi  = merge->coi; coj = merge->coj;
  ierr = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);

  bi     = merge->bi; bj = merge->bj;
  owners = merge->rowmap->range;
  ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  A_loc = ptap->A_loc;
  ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
  a_loc = (Mat_SeqAIJ*)(A_loc)->data;
  ai    = a_loc->i;
  aj    = a_loc->j;

  ierr = PetscCalloc1(A->cmap->N,&aval);CHKERRQ(ierr); /* non-scalable!!! */

  for (i=0; i<am; i++) {
    /* 2-a) put A[i,:] into the dense array aval */
    anz = ai[i+1] - ai[i];
    adj = aj + ai[i];
    ada = a_loc->a + ai[i];
    for (j=0; j<anz; j++) {
      aval[adj[j]] = ada[j];
    }

    /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
    /*--------------------------------------------------------------*/
    /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
    pnz = po->i[i+1] - po->i[i];
    poJ = po->j + po->i[i];
    pA  = po->a + po->i[i];
    for (j=0; j<pnz; j++) {
      row = poJ[j];
      cnz = coi[row+1] - coi[row];
      cj  = coj + coi[row];
      ca  = coa + coi[row];
      /* perform dense axpy */
      valtmp = pA[j];
      for (k=0; k<cnz; k++) {
        ca[k] += valtmp*aval[cj[k]];
      }
      ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
    }

    /* put the value into Cd (diagonal part) */
    pnz = pd->i[i+1] - pd->i[i];
    pdJ = pd->j + pd->i[i];
    pA  = pd->a + pd->i[i];
    for (j=0; j<pnz; j++) {
      row = pdJ[j];
      cnz = bi[row+1] - bi[row];
      cj  = bj + bi[row];
      ca  = ba + bi[row];
      /* perform dense axpy */
      valtmp = pA[j];
      for (k=0; k<cnz; k++) {
        ca[k] += valtmp*aval[cj[k]];
      }
      ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
    }

    /* zero out the dense copy of A[i,:] in aval */
    aJ = aj + ai[i];
    for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
  }

  /* 3) send and recv matrix values coa */
  /*------------------------------------*/
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;
  len_s  = merge->len_s;
  ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
  ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = merge->owners_co[proc];
    ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }
  if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}

  ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);
  ierr = PetscFree(coa);CHKERRQ(ierr);

  /* 4) insert local Cseq and received values into Cmpi */
  /*----------------------------------------------------*/
  ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
    nrows       = *(buf_ri_k[k]);
    nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of the k-th received i-structure */
  }

  for (i=0; i<cm; i++) {
    row  = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj + bi[i];  /* col indices of the i-th row of C */
    ba_i = ba + bi[i];
    bnz  = bi[i+1] - bi[i];
    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++) { /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cnz    = *(nextci[k]+1) - *nextci[k];
        cj     = buf_rj[k] + *(nextci[k]);
        ca     = abuf_r[k] + *(nextci[k]);
        nextcj = 0;
        for (j=0; nextcj<cnz; j++) {
          if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
            ba_i[j] += ca[nextcj++];
          }
        }
        nextrow[k]++; nextci[k]++;
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }
    }
    ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ierr = PetscFree(ba);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscFree(aval);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
/* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable"
PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,PetscReal fill,Mat *C)
{
  PetscErrorCode      ierr;
  Mat                 Cmpi,A_loc,POt,PDt;
  Mat_PtAPMPI         *ptap;
  PetscFreeSpaceList  free_space=NULL,current_space=NULL;
  Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
  PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
  PetscInt            nnz;
  PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
  PetscInt            am=A->rmap->n,pn=P->cmap->n;
  PetscBT             lnkbt;
  MPI_Comm            comm;
  PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt            len,proc,*dnz,*onz,*owners;
  PetscInt            nzi,*bi,*bj;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  Mat_Merge_SeqsToMPI *merge;
  PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
  PetscReal           afill  =1.0,afill_tmp;
  PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N;
  PetscScalar         *vals;
  Mat_SeqAIJ          *a_loc,*pdt,*pot;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  /* check if matrix local sizes are compatible */
  if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D,%D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);

  ptap->A_loc = A_loc;

  a_loc = (Mat_SeqAIJ*)(A_loc)->data;
  ai    = a_loc->i;
  aj    = a_loc->j;

  /* determine symbolic Co=(p->B)^T*A - send to others */
  /*----------------------------------------------------*/
  ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
  pdt  = (Mat_SeqAIJ*)PDt->data;
  pdti = pdt->i; pdtj = pdt->j;

  ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
  pot  = (Mat_SeqAIJ*)POt->data;
  poti = pot->i; potj = pot->j;

  /* then, compute symbolic Co = (p->B)^T*A */
  pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */
  ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
  coi[0] = 0;

  /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
  nnz           = fill*(poti[pon] + ai[am]);
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* create and initialize a linked list */
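  /* note: the list and its bit array are sized by aN = A->cmap->N (the global
     number of columns of A), which is what makes this variant non-scalable
     in memory */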
  ierr = PetscLLCondensedCreate(aN,aN,&lnk,&lnkbt);CHKERRQ(ierr);

  for (i=0; i<pon; i++) {
    pnz = poti[i+1] - poti[i];
    ptJ = potj + poti[i];
    for (j=0; j<pnz; j++) {
      row  = ptJ[j]; /* row of A_loc == col of Pot */
      anz  = ai[row+1] - ai[row];
      Jptr = aj + ai[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    nnz = lnk[0];

    /* If free space is not available, double the total space in the list */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, and zero out dense rows */
    ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;

    coi[i+1] = coi[i] + nnz;
  }

  ierr = PetscMalloc1(coi[pon]+1,&coj);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);

  afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;

  /* send j-array (coj) of Co to other processors */
  /*----------------------------------------------*/
  /* determine row ownership */
  ierr = PetscNew(&merge);CHKERRQ(ierr);
  ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);

  merge->rowmap->n  = pn;
  merge->rowmap->bs = 1;

  ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
  owners = merge->rowmap->range;

  /* determine the number of messages to send, and their lengths */
  ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
  ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);

  len_s        = merge->len_s;
  merge->nsend = 0;

  ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
  ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

  proc = 0;
  for (i=0; i<pon; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i];
  }

  len          = 0; /* max length of buf_si[] */
  owners_co[0] = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_si[proc]) {
      merge->nsend++;
      len_si[proc] = 2*(len_si[proc] + 1);
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* receives and sends of coj are complete */
  ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
  for (i=0; i<merge->nrecv; i++) {
    PetscMPIInt icompleted;
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* send and recv coi */
  /*-------------------*/
  ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of the k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
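    /* e.g., sending local rows 2 and 5 with 3 and 4 nonzeros respectively
       gives buf_si = [2, 2,5, 0,3,7] (an illustration of the layout above) */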
1274     /*-------------------------------------------*/
1275     nrows       = len_si[proc]/2 - 1;
1276     buf_si_i    = buf_si + nrows+1;
1277     buf_si[0]   = nrows;
1278     buf_si_i[0] = 0;
1279     nrows       = 0;
1280     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1281       nzi               = coi[i+1] - coi[i];
1282       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1283       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1284       nrows++;
1285     }
1286     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1287     k++;
1288     buf_si += len_si[proc];
1289   }
1290   i = merge->nrecv;
1291   while (i--) {
1292     PetscMPIInt icompleted;
1293     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1294   }
1295   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1296   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1297   ierr = PetscFree(len_si);CHKERRQ(ierr);
1298   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1299   ierr = PetscFree(swaits);CHKERRQ(ierr);
1300   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1301   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1302 
1303   /* compute the local portion of C (mpi mat) */
1304   /*------------------------------------------*/
1305   /* allocate bi array and free space for accumulating nonzero column info */
1306   ierr  = PetscMalloc1(pn+1,&bi);CHKERRQ(ierr);
1307   bi[0] = 0;
1308 
1309   /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1310   nnz           = fill*(pdti[pn] + poti[pon] + ai[am]);
1311   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1312   current_space = free_space;
1313 
1314   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1315   for (k=0; k<merge->nrecv; k++) {
1316     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1317     nrows       = *buf_ri_k[k];
1318     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
1319     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure  */
1320   }
1321 
1322   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1323   rmax = 0;
1324   for (i=0; i<pn; i++) {
1325     /* add pdt[i,:]*AP into lnk */
1326     pnz = pdti[i+1] - pdti[i];
1327     ptJ = pdtj + pdti[i];
1328     for (j=0; j<pnz; j++) {
1329       row  = ptJ[j];  /* row of AP == col of Pt */
1330       anz  = ai[row+1] - ai[row];
1331       Jptr = aj + ai[row];
1332       /* add non-zero cols of AP into the sorted linked list lnk */
1333       ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1334     }
1335 
1336     /* add received col data into lnk */
1337     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1338       if (i == *nextrow[k]) { /* i-th row */
1339         nzi  = *(nextci[k]+1) - *nextci[k];
1340         Jptr = buf_rj[k] + *nextci[k];
1341         ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1342         nextrow[k]++; nextci[k]++;
1343       }
1344     }
1345     nnz = lnk[0];
1346 
1347     /* if free space is not available, make more free space */
1348     if (current_space->local_remaining<nnz) {
1349       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1350       nspacedouble++;
1351     }
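    /* PetscFreeSpaceGet chains a fresh segment of at least
       nnz + total_array_size entries onto the list, so capacity grows
       geometrically; nspacedouble counts these reallocations for the
       PetscInfo summary below */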
1352     /* copy data into free space, then initialize lnk */
1353     ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1354     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1355 
1356     current_space->array           += nnz;
1357     current_space->local_used      += nnz;
1358     current_space->local_remaining -= nnz;
1359 
1360     bi[i+1] = bi[i] + nnz;
1361     if (nnz > rmax) rmax = nnz;
1362   }
1363   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1364 
1365   ierr = PetscMalloc1(bi[pn]+1,&bj);CHKERRQ(ierr);
1366   ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1367 
1368   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1369   if (afill_tmp > afill) afill = afill_tmp;
1370   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
1371   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1372   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1373 
1374   /* create symbolic parallel matrix Cmpi - why can this not be assembled in the Numeric part? */
1375   /*----------------------------------------------------------------------------------*/
1376   ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);
1377 
1378   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1379   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1380   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
1381   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1382   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1383   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1384   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1385   for (i=0; i<pn; i++) {
1386     row  = i + rstart;
1387     nnz  = bi[i+1] - bi[i];
1388     Jptr = bj + bi[i];
1389     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1390   }
1391   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1392   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1393   ierr = PetscFree(vals);CHKERRQ(ierr);
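  /* only the sparsity pattern is set here: vals[] is calloc'ed to zeros, so
     the assembly above fixes the nonzero structure of Cmpi while the actual
     entries are filled in by the numeric routine */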
1394 
1395   merge->bi        = bi;
1396   merge->bj        = bj;
1397   merge->coi       = coi;
1398   merge->coj       = coj;
1399   merge->buf_ri    = buf_ri;
1400   merge->buf_rj    = buf_rj;
1401   merge->owners_co = owners_co;
1402   merge->destroy   = Cmpi->ops->destroy;
1403   merge->duplicate = Cmpi->ops->duplicate;
1404 
1405   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
1406   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1407   Cmpi->ops->duplicate               = MatDuplicate_MPIAIJ_MatPtAP;
1408 
1409   /* attach the supporting struct to Cmpi for reuse */
1410   c           = (Mat_MPIAIJ*)Cmpi->data;
1411   c->ptap     = ptap;
1412   ptap->api   = NULL;
1413   ptap->apj   = NULL;
1414   ptap->merge = merge;
1415   ptap->rmax  = rmax;
1416 
1417   *C = Cmpi;
1418 #if defined(PETSC_USE_INFO)
1419   if (bi[pn] != 0) {
1420     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
1421     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
1422   } else {
1423     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1424   }
1425 #endif
1426   PetscFunctionReturn(0);
1427 }
1428 
1429 #undef __FUNCT__
1430 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ"
1431 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
1432 {
1433   PetscErrorCode      ierr;
1434   Mat_Merge_SeqsToMPI *merge;
1435   Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1436   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1437   Mat_PtAPMPI         *ptap;
1438   PetscInt            *adj;
1439   PetscInt            i,j,k,anz,pnz,row,*cj,nexta;
1440   MatScalar           *ada,*ca,valtmp;
1441   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1442   MPI_Comm            comm;
1443   PetscMPIInt         size,rank,taga,*len_s;
1444   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1445   PetscInt            **buf_ri,**buf_rj;
1446   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local arrays of C (mpi mat) */
1447   MPI_Request         *s_waits,*r_waits;
1448   MPI_Status          *status;
1449   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
1450   PetscInt            *ai,*aj,*coi,*coj;
1451   PetscInt            *poJ,*pdJ;
1452   Mat                 A_loc;
1453   Mat_SeqAIJ          *a_loc;
1454 
1455   PetscFunctionBegin;
1456   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1457   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1458   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1459 
1460   ptap  = c->ptap;
1461   merge = ptap->merge;
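  /* ptap/merge carry the index structures and communication pattern saved by
     the symbolic phase, so repeated numeric products reuse them instead of
     recomputing sparsity patterns and re-negotiating messages */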
1462 
1463   /* 2) compute numeric C_seq = P_loc^T*A_loc */
1464   /*------------------------------------------*/
1465   /* get data from symbolic products */
1466   coi    = merge->coi; coj = merge->coj;
1467   ierr   = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);
1468   bi     = merge->bi; bj = merge->bj;
1469   owners = merge->rowmap->range;
1470   ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);
1471 
1472   /* get A_loc by taking all local rows of A */
1473   A_loc = ptap->A_loc;
1474   ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
1475   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1476   ai    = a_loc->i;
1477   aj    = a_loc->j;
1478 
1479   for (i=0; i<am; i++) {
1480     anz = ai[i+1] - ai[i];
1481     adj = aj + ai[i];
1482     ada = a_loc->a + ai[i];
1483 
1484     /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1485     /*-------------------------------------------------------------*/
1486     /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1487     pnz = po->i[i+1] - po->i[i];
1488     poJ = po->j + po->i[i];
1489     pA  = po->a + po->i[i];
1490     for (j=0; j<pnz; j++) {
1491       row = poJ[j];
1492       cj  = coj + coi[row];
1493       ca  = coa + coi[row];
1494       /* perform sparse axpy */
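      /* adj[] is sorted and a subset of cj[] (the symbolic pattern of row
         'row' of Co), so one merge pass suffices: k scans cj while nexta
         advances only on a column match, accumulating
         Co[row,:] += pA[j]*A_loc[i,:] */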
1495       nexta  = 0;
1496       valtmp = pA[j];
1497       for (k=0; nexta<anz; k++) {
1498         if (cj[k] == adj[nexta]) {
1499           ca[k] += valtmp*ada[nexta];
1500           nexta++;
1501         }
1502       }
1503       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1504     }
1505 
1506     /* put the value into Cd (diagonal part) */
1507     pnz = pd->i[i+1] - pd->i[i];
1508     pdJ = pd->j + pd->i[i];
1509     pA  = pd->a + pd->i[i];
1510     for (j=0; j<pnz; j++) {
1511       row = pdJ[j];
1512       cj  = bj + bi[row];
1513       ca  = ba + bi[row];
1514       /* perform sparse axpy */
1515       nexta  = 0;
1516       valtmp = pA[j];
1517       for (k=0; nexta<anz; k++) {
1518         if (cj[k] == adj[nexta]) {
1519           ca[k] += valtmp*ada[nexta];
1520           nexta++;
1521         }
1522       }
1523       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1524     }
1525   }
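  /* at this point coa holds this process's contributions to rows of C owned
     by other processes (shipped out below), while ba holds the local rows */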
1526 
1527   /* 3) send and recv matrix values coa */
1528   /*------------------------------------*/
1529   buf_ri = merge->buf_ri;
1530   buf_rj = merge->buf_rj;
1531   len_s  = merge->len_s;
1532   ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
1533   ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1534 
1535   ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
1536   for (proc=0,k=0; proc<size; proc++) {
1537     if (!len_s[proc]) continue;
1538     i    = merge->owners_co[proc];
1539     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1540     k++;
1541   }
1542   if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
1543   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
1544 
1545   ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
1546   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1547   ierr = PetscFree(coa);CHKERRQ(ierr);
1548 
1549   /* 4) insert local Cseq and received values into Cmpi */
1550   /*----------------------------------------------------*/
1551   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1552   for (k=0; k<merge->nrecv; k++) {
1553     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1554     nrows       = *(buf_ri_k[k]);
1555     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
1556     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1557   }
1558 
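  /* fold the locally computed values in ba together with the contributions
     received for each row; as in step 2, bj_i is a sorted superset of every
     received cj, so matching columns are accumulated by a merge pass */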
1559   for (i=0; i<cm; i++) {
1560     row  = owners[rank] + i; /* global row index of C_seq */
1561     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1562     ba_i = ba + bi[i];
1563     bnz  = bi[i+1] - bi[i];
1564     /* add received vals into ba */
1565     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1566       /* i-th row */
1567       if (i == *nextrow[k]) {
1568         cnz    = *(nextci[k]+1) - *nextci[k];
1569         cj     = buf_rj[k] + *(nextci[k]);
1570         ca     = abuf_r[k] + *(nextci[k]);
1571         nextcj = 0;
1572         for (j=0; nextcj<cnz; j++) {
1573           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1574             ba_i[j] += ca[nextcj++];
1575           }
1576         }
1577         nextrow[k]++; nextci[k]++;
1578         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1579       }
1580     }
1581     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
1582   }
1583   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1584   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1585 
1586   ierr = PetscFree(ba);CHKERRQ(ierr);
1587   ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
1588   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1589   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1590   PetscFunctionReturn(0);
1591 }
1592 
1593 PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1594 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ();
1595    it differs from MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable in using PetscLLCondensedCreate_Scalable() */
1596 #undef __FUNCT__
1597 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ"
1598 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
1599 {
1600   PetscErrorCode      ierr;
1601   Mat                 Cmpi,A_loc,POt,PDt;
1602   Mat_PtAPMPI         *ptap;
1603   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1604   Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
1605   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1606   PetscInt            nnz;
1607   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1608   PetscInt            am  =A->rmap->n,pn=P->cmap->n;
1609   MPI_Comm            comm;
1610   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1611   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1612   PetscInt            len,proc,*dnz,*onz,*owners;
1613   PetscInt            nzi,*bi,*bj;
1614   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1615   MPI_Request         *swaits,*rwaits;
1616   MPI_Status          *sstatus,rstatus;
1617   Mat_Merge_SeqsToMPI *merge;
1618   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1619   PetscReal           afill  =1.0,afill_tmp;
1620   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Armax;
1621   PetscScalar         *vals;
1622   Mat_SeqAIJ          *a_loc,*pdt,*pot;
1623   PetscTable          ta;
1624 
1625   PetscFunctionBegin;
1626   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1627   /* check if matrix local sizes are compatible */
1628   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1629 
1630   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1631   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1632 
1633   /* create struct Mat_PtAPMPI and attach it to C later */
1634   ierr = PetscNew(&ptap);CHKERRQ(ierr);
1635 
1636   /* get A_loc by taking all local rows of A */
1637   ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
1638 
1639   ptap->A_loc = A_loc;
1640   a_loc       = (Mat_SeqAIJ*)(A_loc)->data;
1641   ai          = a_loc->i;
1642   aj          = a_loc->j;
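  /* A_loc concatenates the diagonal and off-diagonal blocks of A into one
     SeqAIJ matrix with global column indices; stashing it in ptap lets the
     numeric phase refresh it with MAT_REUSE_MATRIX */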
1643 
1644   /* determine symbolic Co=(p->B)^T*A - send to others */
1645   /*----------------------------------------------------*/
1646   ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
1647   pdt  = (Mat_SeqAIJ*)PDt->data;
1648   pdti = pdt->i; pdtj = pdt->j;
1649 
1650   ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
1651   pot  = (Mat_SeqAIJ*)POt->data;
1652   poti = pot->i; potj = pot->j;
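  /* PDt and POt are symbolic transposes of P's diagonal and off-diagonal
     blocks; only their index arrays (pdti/pdtj, poti/potj) are used here, to
     map columns of P to the rows of A_loc that must be merged */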
1653 
1654   /* then, compute symbolic Co = (p->B)^T*A */
1655   pon    = (p->B)->cmap->n; /* total number of rows to be sent to other processors
1656                          >= (number of nonzero rows of C_seq) - pn */
1657   ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
1658   coi[0] = 0;
1659 
1660   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1661   nnz           = fill*(poti[pon] + ai[am]);
1662   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1663   current_space = free_space;
1664 
1665   /* create and initialize a linked list */
1666   ierr = PetscTableCreate(2*a_loc->rmax,aN,&ta);CHKERRQ(ierr);
1667   MatRowMergeMax_SeqAIJ(a_loc,am,ta);
1668   ierr = PetscTableGetCount(ta,&Armax);CHKERRQ(ierr);
1669   ierr = PetscLLCondensedCreate_Scalable(Armax,&lnk);CHKERRQ(ierr);
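  /* the table counts the distinct column indices that can appear in a merged
     row, so the condensed linked list needs only O(Armax) workspace instead
     of O(aN) global columns - this is what makes this variant scalable */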
1670 
1671   for (i=0; i<pon; i++) {
1672     pnz = poti[i+1] - poti[i];
1673     ptJ = potj + poti[i];
1674     for (j=0; j<pnz; j++) {
1675       row  = ptJ[j]; /* row of A_loc == col of POt */
1676       anz  = ai[row+1] - ai[row];
1677       Jptr = aj + ai[row];
1678       /* add non-zero cols of AP into the sorted linked list lnk */
1679       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1680     }
1681     nnz = lnk[0];
1682 
1683     /* If free space is not available, double the total space in the list */
1684     if (current_space->local_remaining<nnz) {
1685       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1686       nspacedouble++;
1687     }
1688 
1689     /* Copy data into free space, then initialize lnk */
1690     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1691 
1692     current_space->array           += nnz;
1693     current_space->local_used      += nnz;
1694     current_space->local_remaining -= nnz;
1695 
1696     coi[i+1] = coi[i] + nnz;
1697   }
1698 
1699   ierr = PetscMalloc1(coi[pon]+1,&coj);CHKERRQ(ierr);
1700   ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
1701   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); /* must destroy to get a new one for C */
1702 
1703   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1704   if (afill_tmp > afill) afill = afill_tmp;
1705 
1706   /* send j-array (coj) of Co to other processors */
1707   /*----------------------------------------------*/
1708   /* determine row ownership */
1709   ierr = PetscNew(&merge);CHKERRQ(ierr);
1710   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1711 
1712   merge->rowmap->n  = pn;
1713   merge->rowmap->bs = 1;
1714 
1715   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1716   owners = merge->rowmap->range;
1717 
1718   /* determine the number of messages to send, their lengths */
1719   ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
1720   ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);
1721 
1722   len_s        = merge->len_s;
1723   merge->nsend = 0;
1724 
1725   ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
1726   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1727 
1728   proc = 0;
1729   for (i=0; i<pon; i++) {
1730     while (prmap[i] >= owners[proc+1]) proc++;
1731     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1732     len_s[proc] += coi[i+1] - coi[i];
1733   }
1734 
1735   len          = 0; /* max length of buf_si[] */
1736   owners_co[0] = 0;
1737   for (proc=0; proc<size; proc++) {
1738     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1739     if (len_si[proc]) {
1740       merge->nsend++;
1741       len_si[proc] = 2*(len_si[proc] + 1);
1742       len         += len_si[proc];
1743     }
1744   }
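  /* each i-structure message carries 1 count + nrows row indices + (nrows+1)
     offsets = 2*(nrows+1) integers, hence the doubling of len_si above; len
     accumulates the total send-buffer size over all destinations */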
1745 
1746   /* determine the number and length of messages to receive for coi and coj  */
1747   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1748   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1749 
1750   /* post the Irecv and Isend of coj */
1751   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1752   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1753   ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
1754   for (proc=0, k=0; proc<size; proc++) {
1755     if (!len_s[proc]) continue;
1756     i    = owners_co[proc];
1757     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1758     k++;
1759   }
1760 
1761   /* receives and sends of coj are complete */
1762   ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
1763   for (i=0; i<merge->nrecv; i++) {
1764     PetscMPIInt icompleted;
1765     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1766   }
1767   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1768   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1769 
1770   /* add received column indices into table to update Armax */
1771   for (k=0; k<merge->nrecv; k++) {/* k-th received message */
1772     Jptr = buf_rj[k];
1773     for (j=0; j<merge->len_r[k]; j++) {
1774       ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
1775     }
1776   }
1777   ierr = PetscTableGetCount(ta,&Armax);CHKERRQ(ierr);
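  /* received column indices may fall outside the pattern of A_loc, so the
     table (and with it Armax) is updated before lnk is re-created for the
     rows of C; the +1 shift keeps the disallowed table key 0 from occurring */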
1778 
1779   /* send and recv coi */
1780   /*-------------------*/
1781   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1782   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1783   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
1784   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1785   for (proc=0,k=0; proc<size; proc++) {
1786     if (!len_s[proc]) continue;
1787     /* form outgoing message for i-structure:
1788          buf_si[0]:                 nrows to be sent
1789                [1:nrows]:           row index (global)
1790                [nrows+1:2*nrows+1]: i-structure index
1791     */
1792     /*-------------------------------------------*/
1793     nrows       = len_si[proc]/2 - 1;
1794     buf_si_i    = buf_si + nrows+1;
1795     buf_si[0]   = nrows;
1796     buf_si_i[0] = 0;
1797     nrows       = 0;
1798     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1799       nzi               = coi[i+1] - coi[i];
1800       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
1801       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1802       nrows++;
1803     }
1804     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1805     k++;
1806     buf_si += len_si[proc];
1807   }
1808   i = merge->nrecv;
1809   while (i--) {
1810     PetscMPIInt icompleted;
1811     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1812   }
1813   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1814   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1815   ierr = PetscFree(len_si);CHKERRQ(ierr);
1816   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1817   ierr = PetscFree(swaits);CHKERRQ(ierr);
1818   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1819   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1820 
1821   /* compute the local portion of C (mpi mat) */
1822   /*------------------------------------------*/
1823   /* allocate bi array and free space for accumulating nonzero column info */
1824   ierr  = PetscMalloc1(pn+1,&bi);CHKERRQ(ierr);
1825   bi[0] = 0;
1826 
1827   /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1828   nnz           = fill*(pdti[pn] + poti[pon] + ai[am]);
1829   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1830   current_space = free_space;
1831 
1832   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1833   for (k=0; k<merge->nrecv; k++) {
1834     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1835     nrows       = *buf_ri_k[k];
1836     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
1837     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1838   }
1839 
1840   ierr = PetscLLCondensedCreate_Scalable(Armax,&lnk);CHKERRQ(ierr);
1841   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1842   rmax = 0;
1843   for (i=0; i<pn; i++) {
1844     /* add pdt[i,:]*AP into lnk */
1845     pnz = pdti[i+1] - pdti[i];
1846     ptJ = pdtj + pdti[i];
1847     for (j=0; j<pnz; j++) {
1848       row  = ptJ[j];  /* row of AP == col of Pt */
1849       anz  = ai[row+1] - ai[row];
1850       Jptr = aj + ai[row];
1851       /* add non-zero cols of AP into the sorted linked list lnk */
1852       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1853     }
1854 
1855     /* add received col data into lnk */
1856     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1857       if (i == *nextrow[k]) { /* i-th row */
1858         nzi  = *(nextci[k]+1) - *nextci[k];
1859         Jptr = buf_rj[k] + *nextci[k];
1860         ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
1861         nextrow[k]++; nextci[k]++;
1862       }
1863     }
1864     nnz = lnk[0];
1865 
1866     /* if free space is not available, make more free space */
1867     if (current_space->local_remaining<nnz) {
1868       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1869       nspacedouble++;
1870     }
1871     /* copy data into free space, then initialize lnk */
1872     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1873     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1874 
1875     current_space->array           += nnz;
1876     current_space->local_used      += nnz;
1877     current_space->local_remaining -= nnz;
1878 
1879     bi[i+1] = bi[i] + nnz;
1880     if (nnz > rmax) rmax = nnz;
1881   }
1882   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1883 
1884   ierr      = PetscMalloc1(bi[pn]+1,&bj);CHKERRQ(ierr);
1885   ierr      = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1886   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1887   if (afill_tmp > afill) afill = afill_tmp;
1888   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
1889   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
1890 
1891   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1892   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1893 
1894   /* create symbolic parallel matrix Cmpi - why can this not be assembled in the Numeric part? */
1895   /*----------------------------------------------------------------------------------*/
1896   ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);
1897 
1898   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1899   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1900   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
1901   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1902   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1903   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1904   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1905   for (i=0; i<pn; i++) {
1906     row  = i + rstart;
1907     nnz  = bi[i+1] - bi[i];
1908     Jptr = bj + bi[i];
1909     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1910   }
1911   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1912   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1913   ierr = PetscFree(vals);CHKERRQ(ierr);
1914 
1915   merge->bi        = bi;
1916   merge->bj        = bj;
1917   merge->coi       = coi;
1918   merge->coj       = coj;
1919   merge->buf_ri    = buf_ri;
1920   merge->buf_rj    = buf_rj;
1921   merge->owners_co = owners_co;
1922   merge->destroy   = Cmpi->ops->destroy;
1923   merge->duplicate = Cmpi->ops->duplicate;
1924 
1925   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
1926   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1927   Cmpi->ops->duplicate               = MatDuplicate_MPIAIJ_MatPtAP;
1928 
1929   /* attach the supporting struct to Cmpi for reuse */
1930   c = (Mat_MPIAIJ*)Cmpi->data;
1931 
1932   c->ptap     = ptap;
1933   ptap->api   = NULL;
1934   ptap->apj   = NULL;
1935   ptap->merge = merge;
1936   ptap->rmax  = rmax;
1937   ptap->apa   = NULL;
1938 
1939   *C = Cmpi;
1940 #if defined(PETSC_USE_INFO)
1941   if (bi[pn] != 0) {
1942     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
1943     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
1944   } else {
1945     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1946   }
1947 #endif
1948   PetscFunctionReturn(0);
1949 }
1950