xref: /petsc/src/mat/impls/aij/mpi/mpiptap.c (revision 972064b65a4dc64f53e590d2cefc17fe4e1ae983)
1 
2 /*
3   Defines projective product routines where A is an MPIAIJ matrix
4           C = P^T * A * P
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h>   /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <../src/mat/impls/aij/mpi/mpiaij.h>
10 #include <petscbt.h>
11 #include <petsctime.h>
12 
13 /* #define PTAP_PROFILE */
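/*
   A minimal usage sketch of the public entry point implemented in this file
   (illustrative only and excluded from compilation; the fill value 2.0 and the
   helper name ExamplePtAPUsage are examples, not PETSc API):
*/
#if 0
static PetscErrorCode ExamplePtAPUsage(Mat A,Mat P)
{
  PetscErrorCode ierr;
  Mat            C;

  PetscFunctionBegin;
  /* first call builds the symbolic structure and computes C = P^T*A*P numerically */
  ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);
  /* subsequent calls reuse the symbolic data when the values of A or P change */
  ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);
  ierr = MatDestroy(&C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif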
14 
15 PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
16 {
17   PetscErrorCode    ierr;
18   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data;
19   Mat_APMPI         *ptap=a->ap;
20   PetscBool         iascii;
21   PetscViewerFormat format;
22 
23   PetscFunctionBegin;
24   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
25   if (iascii) {
26     ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
27     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
28       if (ptap->algType == 0) {
29         ierr = PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");CHKERRQ(ierr);
30       } else if (ptap->algType == 1) {
31         ierr = PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");CHKERRQ(ierr);
32       }
33     }
34   }
35   ierr = (ptap->view)(A,viewer);CHKERRQ(ierr);
36   PetscFunctionReturn(0);
37 }
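/*
   For example (illustrative), viewing the product matrix with -mat_view ::ascii_info
   after MatPtAP() includes one of the lines printed above, e.g.
     using scalable MatPtAP() implementation
   followed by the output of the matrix's regular info viewer.
*/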
38 
39 PetscErrorCode MatFreeIntermediateDataStructures_MPIAIJ_AP(Mat A)
40 {
41   PetscErrorCode      ierr;
42   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data;
43   Mat_APMPI           *ptap=a->ap;
44   Mat_Merge_SeqsToMPI *merge;
45 
46   PetscFunctionBegin;
47   if (!ptap) PetscFunctionReturn(0);
48 
49   ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
50   ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
51   ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
52   ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
53   ierr = MatDestroy(&ptap->A_loc);CHKERRQ(ierr); /* used by MatTransposeMatMult() */
54   ierr = MatDestroy(&ptap->Rd);CHKERRQ(ierr);
55   ierr = MatDestroy(&ptap->Ro);CHKERRQ(ierr);
56   if (ptap->AP_loc) { /* used by alg_rap */
57     Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
58     ierr = PetscFree(ap->i);CHKERRQ(ierr);
59     ierr = PetscFree2(ap->j,ap->a);CHKERRQ(ierr);
60     ierr = MatDestroy(&ptap->AP_loc);CHKERRQ(ierr);
61   } else { /* used by alg_ptap */
62     ierr = PetscFree(ptap->api);CHKERRQ(ierr);
63     ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
64   }
65   ierr = MatDestroy(&ptap->C_loc);CHKERRQ(ierr);
66   ierr = MatDestroy(&ptap->C_oth);CHKERRQ(ierr);
67   if (ptap->apa) {ierr = PetscFree(ptap->apa);CHKERRQ(ierr);}
68 
69   ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);
70 
71   merge=ptap->merge;
72   if (merge) { /* used by alg_ptap */
73     ierr = PetscFree(merge->id_r);CHKERRQ(ierr);
74     ierr = PetscFree(merge->len_s);CHKERRQ(ierr);
75     ierr = PetscFree(merge->len_r);CHKERRQ(ierr);
76     ierr = PetscFree(merge->bi);CHKERRQ(ierr);
77     ierr = PetscFree(merge->bj);CHKERRQ(ierr);
78     ierr = PetscFree(merge->buf_ri[0]);CHKERRQ(ierr);
79     ierr = PetscFree(merge->buf_ri);CHKERRQ(ierr);
80     ierr = PetscFree(merge->buf_rj[0]);CHKERRQ(ierr);
81     ierr = PetscFree(merge->buf_rj);CHKERRQ(ierr);
82     ierr = PetscFree(merge->coi);CHKERRQ(ierr);
83     ierr = PetscFree(merge->coj);CHKERRQ(ierr);
84     ierr = PetscFree(merge->owners_co);CHKERRQ(ierr);
85     ierr = PetscLayoutDestroy(&merge->rowmap);CHKERRQ(ierr);
86     ierr = PetscFree(ptap->merge);CHKERRQ(ierr);
87   }
88 
89   A->ops->destroy = ptap->destroy;
90   ierr = PetscFree(a->ap);CHKERRQ(ierr);
91   PetscFunctionReturn(0);
92 }
93 
94 PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
95 {
96   PetscErrorCode ierr;
97 
98   PetscFunctionBegin;
99   ierr = (*A->ops->freeintermediatedatastructures)(A);CHKERRQ(ierr);
100   ierr = (*A->ops->destroy)(A);CHKERRQ(ierr);
101   PetscFunctionReturn(0);
102 }
103 
104 PETSC_INTERN PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
105 {
106   PetscErrorCode ierr;
107   PetscBool      flg;
108   MPI_Comm       comm;
109 #if !defined(PETSC_HAVE_HYPRE)
110   const char          *algTypes[2] = {"scalable","nonscalable"};
111   PetscInt            nalg=2;
112 #else
113   const char          *algTypes[3] = {"scalable","nonscalable","hypre"};
114   PetscInt            nalg=3;
115 #endif
116   PetscInt            pN=P->cmap->N,alg=1; /* set default algorithm (nonscalable) */
117 
118   PetscFunctionBegin;
119   /* check if matrix local sizes are compatible */
120   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
121   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
122   if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
123 
124   if (scall == MAT_INITIAL_MATRIX) {
125     /* pick an algorithm */
126     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
127     ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
128     ierr = PetscOptionsEnd();CHKERRQ(ierr);
129 
130     if (!flg && pN > 100000) { /* may switch to scalable algorithm as default */
131       MatInfo     Ainfo,Pinfo;
132       PetscInt    nz_local;
133       PetscBool   alg_scalable_loc=PETSC_FALSE,alg_scalable;
134 
135       ierr = MatGetInfo(A,MAT_LOCAL,&Ainfo);CHKERRQ(ierr);
136       ierr = MatGetInfo(P,MAT_LOCAL,&Pinfo);CHKERRQ(ierr);
137       nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated);
138 
139       if (pN > fill*nz_local) alg_scalable_loc = PETSC_TRUE;
140       ierr = MPIU_Allreduce(&alg_scalable_loc,&alg_scalable,1,MPIU_BOOL,MPI_LOR,comm);CHKERRQ(ierr);
141 
142       if (alg_scalable) {
143         alg = 0; /* the scalable algorithm can be ~50% slower than the nonscalable one */
144       }
145     }
146 
147     switch (alg) {
148     case 1:
149       /* do R=P^T locally, then C=R*A*P -- nonscalable */
150       ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
151       ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);
152       ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
153       break;
154 #if defined(PETSC_HAVE_HYPRE)
155     case 2:
156       /* Use boomerAMGBuildCoarseOperator */
157       ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
158       ierr = MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);CHKERRQ(ierr);
159       ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
160       break;
161 #endif
162     default:
163       /* do R=P^T locally, then C=R*A*P -- scalable */
164       ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
165       ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,fill,C);CHKERRQ(ierr);
166       ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
167       break;
168     }
169 
170     if (alg == 0 || alg == 1) {
171       Mat_MPIAIJ *c  = (Mat_MPIAIJ*)(*C)->data;
172       Mat_APMPI  *ap = c->ap;
173       ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)(*C)),((PetscObject)(*C))->prefix,"MatFreeIntermediateDataStructures","Mat");CHKERRQ(ierr);
174       ap->freestruct = PETSC_FALSE;
175       ierr = PetscOptionsBool("-mat_freeintermediatedatastructures","Free intermediate data structures", "MatFreeIntermediateDataStructures",ap->freestruct,&ap->freestruct, NULL);CHKERRQ(ierr);
176       ierr = PetscOptionsEnd();CHKERRQ(ierr);
177     }
178   }
179 
180   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
181   ierr = (*(*C)->ops->ptapnumeric)(A,P,*C);CHKERRQ(ierr);
182   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
183   PetscFunctionReturn(0);
184 }
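/*
   Run-time options handled by MatPtAP_MPIAIJ_MPIAIJ() above (a summary; command lines illustrative):

     -matptap_via scalable                  memory-scalable algorithm (alg 0)
     -matptap_via nonscalable               faster but less memory-scalable algorithm (alg 1, the default)
     -matptap_via hypre                     use hypre's BoomerAMG coarse-operator routine (requires PETSc configured with hypre)
     -mat_freeintermediatedatastructures    free the supporting Mat_APMPI struct after the numeric phase;
                                            the product can then no longer be recomputed with MAT_REUSE_MATRIX
*/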
185 
186 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
187 {
188   PetscErrorCode    ierr;
189   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
190   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
191   Mat_SeqAIJ        *ap,*p_loc,*p_oth,*c_seq;
192   Mat_APMPI         *ptap = c->ap;
193   Mat               AP_loc,C_loc,C_oth;
194   PetscInt          i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz;
195   PetscScalar       *apa;
196   const PetscInt    *cols;
197   const PetscScalar *vals;
198 
199   PetscFunctionBegin;
200   if (!ptap) {
201     MPI_Comm comm;
202     ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
203     SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
204   }
205 
206   ierr = MatZeroEntries(C);CHKERRQ(ierr);
207 
208   /* 1) get R = Pd^T,Ro = Po^T */
209   if (ptap->reuse == MAT_REUSE_MATRIX) {
210     ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
211     ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
212   }
213 
214   /* 2) get AP_loc */
215   AP_loc = ptap->AP_loc;
216   ap = (Mat_SeqAIJ*)AP_loc->data;
217 
218   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
219   /*-----------------------------------------------------*/
220   if (ptap->reuse == MAT_REUSE_MATRIX) {
221     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
222     ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
223     ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
224   }
225 
226   /* 2-2) compute numeric A_loc*P - dominating part */
227   /* ---------------------------------------------- */
228   /* get data from symbolic products */
229   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
230   if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
231   else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"ptap->P_oth is NULL. Cannot proceed!");
232 
233   api   = ap->i;
234   apj   = ap->j;
235   for (i=0; i<am; i++) {
236     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
237     apnz = api[i+1] - api[i];
238     apa = ap->a + api[i];
239     ierr = PetscMemzero(apa,sizeof(PetscScalar)*apnz);CHKERRQ(ierr);
240     AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
241     ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr);
242   }
243 
244   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
245   ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr);
246   ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr);
247 
248   C_loc = ptap->C_loc;
249   C_oth = ptap->C_oth;
250 
251   /* add C_loc and Co to C */
252   ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);
253 
254   /* C_loc -> C */
255   cm    = C_loc->rmap->N;
256   c_seq = (Mat_SeqAIJ*)C_loc->data;
257   cols = c_seq->j;
258   vals = c_seq->a;
259 
260   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
261   /* when there are no off-processor parts. */
262   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
263   /* no longer holds. The more general function MatSetValues_MPIAIJ() must then be used, where the column index is looked up from */
264   /* a table and additional bookkeeping is performed. */
265   if (C->assembled) {
266     C->was_assembled = PETSC_TRUE;
267     C->assembled     = PETSC_FALSE;
268   }
269   if (C->was_assembled) {
270     for (i=0; i<cm; i++) {
271       ncols = c_seq->i[i+1] - c_seq->i[i];
272       row = rstart + i;
273       ierr = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
274       cols += ncols; vals += ncols;
275     }
276   } else {
277     ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr);
278   }
279 
280   /* Co -> C, off-processor part */
281   cm = C_oth->rmap->N;
282   c_seq = (Mat_SeqAIJ*)C_oth->data;
283   cols = c_seq->j;
284   vals = c_seq->a;
285   for (i=0; i<cm; i++) {
286     ncols = c_seq->i[i+1] - c_seq->i[i];
287     row = p->garray[i];
288     ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
289     cols += ncols; vals += ncols;
290   }
291   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
292   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
293 
294   ptap->reuse = MAT_REUSE_MATRIX;
295 
296   /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated with new values of A and P */
297   if (ptap->freestruct) {
298     ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
299   }
300   PetscFunctionReturn(0);
301 }
302 
303 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat *C)
304 {
305   PetscErrorCode      ierr;
306   Mat_APMPI           *ptap;
307   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
308   MPI_Comm            comm;
309   PetscMPIInt         size,rank;
310   Mat                 Cmpi,P_loc,P_oth;
311   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
312   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
313   PetscInt            *lnk,i,k,pnz,row,nsend;
314   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
315   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
316   PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
317   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
318   MPI_Request         *swaits,*rwaits;
319   MPI_Status          *sstatus,rstatus;
320   PetscLayout         rowmap;
321   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
322   PetscMPIInt         *len_r,*id_r;    /* lengths and source ranks of the messages to be received */
323   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi;
324   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
325   PetscScalar         *apv;
326   PetscTable          ta;
327   MatType             mtype;
328 #if defined(PETSC_USE_INFO)
329   PetscReal           apfill;
330 #endif
331 
332   PetscFunctionBegin;
333   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
334   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
335   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
336 
337   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
338 
339   /* create symbolic parallel matrix Cmpi */
340   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
341   ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
342   ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);
343 
344   /* create struct Mat_APMPI and attach it to C later */
345   ierr        = PetscNew(&ptap);CHKERRQ(ierr);
346   ptap->reuse = MAT_INITIAL_MATRIX;
347   ptap->algType = 0;
348 
349   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
350   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);CHKERRQ(ierr);
351   /* get P_loc by taking all local rows of P */
352   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);CHKERRQ(ierr);
353 
354   ptap->P_loc = P_loc;
355   ptap->P_oth = P_oth;
356 
357   /* (0) compute Rd = Pd^T, Ro = Po^T  */
358   /* --------------------------------- */
359   ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
360   ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);
361 
362   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
363   /* ----------------------------------------------------------------- */
364   p_loc  = (Mat_SeqAIJ*)P_loc->data;
365   if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;
366 
367   /* create and initialize a linked list */
368   ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* used to compute AP_loc and Cmpi */
369   MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
370   MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
371   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */
372 
373   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
374 
375   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
376   if (ao) {
377     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr);
378   } else {
379     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr);
380   }
381   current_space = free_space;
382   nspacedouble  = 0;
383 
384   ierr   = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
385   api[0] = 0;
386   for (i=0; i<am; i++) {
387     /* diagonal portion: Ad[i,:]*P */
388     ai = ad->i; pi = p_loc->i;
389     nzi = ai[i+1] - ai[i];
390     aj  = ad->j + ai[i];
391     for (j=0; j<nzi; j++) {
392       row  = aj[j];
393       pnz  = pi[row+1] - pi[row];
394       Jptr = p_loc->j + pi[row];
395       /* add non-zero cols of P into the sorted linked list lnk */
396       ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
397     }
398     /* off-diagonal portion: Ao[i,:]*P */
399     if (ao) {
400       ai = ao->i; pi = p_oth->i;
401       nzi = ai[i+1] - ai[i];
402       aj  = ao->j + ai[i];
403       for (j=0; j<nzi; j++) {
404         row  = aj[j];
405         pnz  = pi[row+1] - pi[row];
406         Jptr = p_oth->j + pi[row];
407         ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
408       }
409     }
410     apnz     = lnk[0];
411     api[i+1] = api[i] + apnz;
412 
413     /* if free space is not available, double the total space in the list */
414     if (current_space->local_remaining<apnz) {
415       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
416       nspacedouble++;
417     }
418 
419     /* Copy data into free space, then initialize lnk */
420     ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);
421 
422     current_space->array           += apnz;
423     current_space->local_used      += apnz;
424     current_space->local_remaining -= apnz;
425   }
426   /* Allocate space for apj and apv, initialize apj, and */
427   /* destroy list of free space and other temporary array(s) */
428   ierr = PetscMalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr);
429   ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
430   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
431 
432   /* Create AP_loc for reuse */
433   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr);
434 
435 #if defined(PETSC_USE_INFO)
436   if (ao) {
437     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
438   } else {
439     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
440   }
441   ptap->AP_loc->info.mallocs           = nspacedouble;
442   ptap->AP_loc->info.fill_ratio_given  = fill;
443   ptap->AP_loc->info.fill_ratio_needed = apfill;
444 
445   if (api[am]) {
446     ierr = PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr);
447     ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);CHKERRQ(ierr);
448   } else {
449     ierr = PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n");CHKERRQ(ierr);
450   }
451 #endif
452 
453   /* (2-1) compute symbolic Co = Ro*AP_loc  */
454   /* ------------------------------------ */
455   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);CHKERRQ(ierr);
456 
457   /* (3) send coj of C_oth to other processors  */
458   /* ------------------------------------------ */
459   /* determine row ownership */
460   ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr);
461   rowmap->n  = pn;
462   rowmap->bs = 1;
463   ierr   = PetscLayoutSetUp(rowmap);CHKERRQ(ierr);
464   owners = rowmap->range;
465 
466   /* determine the number of messages to send, their lengths */
467   ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr);
468   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
469   ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
470 
471   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
472   coi   = c_oth->i; coj = c_oth->j;
473   con   = ptap->C_oth->rmap->n;
474   proc  = 0;
475   for (i=0; i<con; i++) {
476     while (prmap[i] >= owners[proc+1]) proc++;
477     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
478     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
479   }
480 
481   len          = 0; /* max length of buf_si[], see (4) */
482   owners_co[0] = 0;
483   nsend        = 0;
484   for (proc=0; proc<size; proc++) {
485     owners_co[proc+1] = owners_co[proc] + len_si[proc];
486     if (len_s[proc]) {
487       nsend++;
488       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
489       len         += len_si[proc];
490     }
491   }
492 
493   /* determine the number and length of messages to receive for coi and coj  */
494   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr);
495   ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr);
496 
497   /* post the Irecv and Isend of coj */
498   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
499   ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
500   ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr);
501   for (proc=0, k=0; proc<size; proc++) {
502     if (!len_s[proc]) continue;
503     i    = owners_co[proc];
504     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
505     k++;
506   }
507 
508   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
509   /* ---------------------------------------- */
510   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);CHKERRQ(ierr);
511   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
512 
513   /* receives coj are complete */
514   for (i=0; i<nrecv; i++) {
515     ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
516   }
517   ierr = PetscFree(rwaits);CHKERRQ(ierr);
518   if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}
519 
520   /* add received column indices into ta to update Crmax */
521   for (k=0; k<nrecv; k++) {/* k-th received message */
522     Jptr = buf_rj[k];
523     for (j=0; j<len_r[k]; j++) {
524       ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
525     }
526   }
527   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
528   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
529 
530   /* (4) send and recv coi */
531   /*-----------------------*/
532   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
533   ierr   = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
534   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
535   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
536   for (proc=0,k=0; proc<size; proc++) {
537     if (!len_s[proc]) continue;
538     /* form outgoing message for i-structure:
539          buf_si[0]:                 nrows to be sent
540               [1:nrows]:           row index (local to the destination process)
541                [nrows+1:2*nrows+1]: i-structure index
542     */
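    /* e.g. sending two rows with 3 and 5 nonzeros (illustrative) gives len_si[proc]=6 and
       buf_si = {2, r0, r1, 0, 3, 8}, where r0,r1 are the rows' local indices on the
       destination process and {0,3,8} is the accompanying i-structure */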
543     /*-------------------------------------------*/
544     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
545     buf_si_i    = buf_si + nrows+1;
546     buf_si[0]   = nrows;
547     buf_si_i[0] = 0;
548     nrows       = 0;
549     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
550       nzi = coi[i+1] - coi[i];
551       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
552       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
553       nrows++;
554     }
555     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
556     k++;
557     buf_si += len_si[proc];
558   }
559   for (i=0; i<nrecv; i++) {
560     ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
561   }
562   ierr = PetscFree(rwaits);CHKERRQ(ierr);
563   if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}
564 
565   ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr);
566   ierr = PetscFree(len_ri);CHKERRQ(ierr);
567   ierr = PetscFree(swaits);CHKERRQ(ierr);
568   ierr = PetscFree(buf_s);CHKERRQ(ierr);
569 
570   /* (5) compute the local portion of Cmpi      */
571   /* ------------------------------------------ */
572   /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
573   ierr          = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr);
574   current_space = free_space;
575 
576   ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr);
577   for (k=0; k<nrecv; k++) {
578     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
579     nrows       = *buf_ri_k[k];
580     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
581     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
582   }
583 
584   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
585   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
586   for (i=0; i<pn; i++) {
587     /* add C_loc into Cmpi */
588     nzi  = c_loc->i[i+1] - c_loc->i[i];
589     Jptr = c_loc->j + c_loc->i[i];
590     ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
591 
592     /* add received col data into lnk */
593     for (k=0; k<nrecv; k++) { /* k-th received message */
594       if (i == *nextrow[k]) { /* i-th row */
595         nzi  = *(nextci[k]+1) - *nextci[k];
596         Jptr = buf_rj[k] + *nextci[k];
597         ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
598         nextrow[k]++; nextci[k]++;
599       }
600     }
601     nzi = lnk[0];
602 
603     /* copy data into free space, then initialize lnk */
604     ierr = PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);CHKERRQ(ierr);
605     ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr);
606   }
607   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
608   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
609   ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr);
610 
611   /* local sizes and preallocation */
612   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
613   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
614   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
615   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
616 
617   /* members in merge */
618   ierr = PetscFree(id_r);CHKERRQ(ierr);
619   ierr = PetscFree(len_r);CHKERRQ(ierr);
620   ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr);
621   ierr = PetscFree(buf_ri);CHKERRQ(ierr);
622   ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr);
623   ierr = PetscFree(buf_rj);CHKERRQ(ierr);
624   ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr);
625 
626   /* attach the supporting struct to Cmpi for reuse */
627   c = (Mat_MPIAIJ*)Cmpi->data;
628   c->ap           = ptap;
629   ptap->duplicate = Cmpi->ops->duplicate;
630   ptap->destroy   = Cmpi->ops->destroy;
631   ptap->view      = Cmpi->ops->view;
632 
633   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
634   Cmpi->assembled        = PETSC_FALSE;
635   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
636   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
637   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
638   Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
639   *C                     = Cmpi;
640   PetscFunctionReturn(0);
641 }
642 
643 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
644 {
645   PetscErrorCode      ierr;
646   Mat_APMPI           *ptap;
647   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
648   MPI_Comm            comm;
649   PetscMPIInt         size,rank;
650   Mat                 Cmpi;
651   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
652   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
653   PetscInt            *lnk,i,k,pnz,row,nsend;
654   PetscBT             lnkbt;
655   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
656   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
657   PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
658   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
659   MPI_Request         *swaits,*rwaits;
660   MPI_Status          *sstatus,rstatus;
661   PetscLayout         rowmap;
662   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
663   PetscMPIInt         *len_r,*id_r;    /* lengths and source ranks of the messages to be received */
664   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
665   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
666   PetscScalar         *apv;
667   PetscTable          ta;
668   MatType             mtype;
669 #if defined(PETSC_USE_INFO)
670   PetscReal           apfill;
671 #endif
672 
673   PetscFunctionBegin;
674   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
675   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
676   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
677 
678   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
679 
680   /* create symbolic parallel matrix Cmpi */
681   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
682   ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
683   ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);
684 
685   /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
686   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
687 
688   /* create struct Mat_APMPI and attach it to C later */
689   ierr        = PetscNew(&ptap);CHKERRQ(ierr);
690   ptap->reuse = MAT_INITIAL_MATRIX;
691   ptap->algType = 1;
692 
693   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
694   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
695   /* get P_loc by taking all local rows of P */
696   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
697 
698   /* (0) compute Rd = Pd^T, Ro = Po^T  */
699   /* --------------------------------- */
700   ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
701   ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);
702 
703   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
704   /* ----------------------------------------------------------------- */
705   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
706   if (ptap->P_oth) p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
707 
708   /* create and initialize a linked list */
709   ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* used to compute AP_loc and Cmpi */
710   MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
711   MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
712   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */
713   /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
714 
715   ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);
716 
717   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
718   if (ao) {
719     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr);
720   } else {
721     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr);
722   }
723   current_space = free_space;
724   nspacedouble  = 0;
725 
726   ierr   = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
727   api[0] = 0;
728   for (i=0; i<am; i++) {
729     /* diagonal portion: Ad[i,:]*P */
730     ai = ad->i; pi = p_loc->i;
731     nzi = ai[i+1] - ai[i];
732     aj  = ad->j + ai[i];
733     for (j=0; j<nzi; j++) {
734       row  = aj[j];
735       pnz  = pi[row+1] - pi[row];
736       Jptr = p_loc->j + pi[row];
737       /* add non-zero cols of P into the sorted linked list lnk */
738       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
739     }
740     /* off-diagonal portion: Ao[i,:]*P */
741     if (ao) {
742       ai = ao->i; pi = p_oth->i;
743       nzi = ai[i+1] - ai[i];
744       aj  = ao->j + ai[i];
745       for (j=0; j<nzi; j++) {
746         row  = aj[j];
747         pnz  = pi[row+1] - pi[row];
748         Jptr = p_oth->j + pi[row];
749         ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
750       }
751     }
752     apnz     = lnk[0];
753     api[i+1] = api[i] + apnz;
754     if (ap_rmax < apnz) ap_rmax = apnz;
755 
756     /* if free space is not available, double the total space in the list */
757     if (current_space->local_remaining<apnz) {
758       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
759       nspacedouble++;
760     }
761 
762     /* Copy data into free space, then initialize lnk */
763     ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
764 
765     current_space->array           += apnz;
766     current_space->local_used      += apnz;
767     current_space->local_remaining -= apnz;
768   }
769   /* Allocate space for apj and apv, initialize apj, and */
770   /* destroy list of free space and other temporary array(s) */
771   ierr   = PetscMalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr);
772   ierr   = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
773   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
774 
775   /* Create AP_loc for reuse */
776   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr);
777 
778 #if defined(PETSC_USE_INFO)
779   if (ao) {
780     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
781   } else {
782     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
783   }
784   ptap->AP_loc->info.mallocs           = nspacedouble;
785   ptap->AP_loc->info.fill_ratio_given  = fill;
786   ptap->AP_loc->info.fill_ratio_needed = apfill;
787 
788   if (api[am]) {
789     ierr = PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr);
790     ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);CHKERRQ(ierr);
791   } else {
792     ierr = PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n");CHKERRQ(ierr);
793   }
794 #endif
795 
796   /* (2-1) compute symbolic Co = Ro*AP_loc  */
797   /* ------------------------------------ */
798   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);CHKERRQ(ierr);
799 
800   /* (3) send coj of C_oth to other processors  */
801   /* ------------------------------------------ */
802   /* determine row ownership */
803   ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr);
804   rowmap->n  = pn;
805   rowmap->bs = 1;
806   ierr   = PetscLayoutSetUp(rowmap);CHKERRQ(ierr);
807   owners = rowmap->range;
808 
809   /* determine the number of messages to send, their lengths */
810   ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr);
811   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
812   ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
813 
814   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
815   coi   = c_oth->i; coj = c_oth->j;
816   con   = ptap->C_oth->rmap->n;
817   proc  = 0;
818   for (i=0; i<con; i++) {
819     while (prmap[i] >= owners[proc+1]) proc++;
820     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
821     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
822   }
823 
824   len          = 0; /* max length of buf_si[], see (4) */
825   owners_co[0] = 0;
826   nsend        = 0;
827   for (proc=0; proc<size; proc++) {
828     owners_co[proc+1] = owners_co[proc] + len_si[proc];
829     if (len_s[proc]) {
830       nsend++;
831       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
832       len         += len_si[proc];
833     }
834   }
835 
836   /* determine the number and length of messages to receive for coi and coj  */
837   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr);
838   ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr);
839 
840   /* post the Irecv and Isend of coj */
841   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
842   ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
843   ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr);
844   for (proc=0, k=0; proc<size; proc++) {
845     if (!len_s[proc]) continue;
846     i    = owners_co[proc];
847     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
848     k++;
849   }
850 
851   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
852   /* ---------------------------------------- */
853   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);CHKERRQ(ierr);
854   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
855 
856   /* receives coj are complete */
857   for (i=0; i<nrecv; i++) {
858     ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
859   }
860   ierr = PetscFree(rwaits);CHKERRQ(ierr);
861   if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}
862 
863   /* add received column indices into ta to update Crmax */
864   for (k=0; k<nrecv; k++) {/* k-th received message */
865     Jptr = buf_rj[k];
866     for (j=0; j<len_r[k]; j++) {
867       ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
868     }
869   }
870   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
871   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
872 
873   /* (4) send and recv coi */
874   /*-----------------------*/
875   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
876   ierr   = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
877   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
878   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
879   for (proc=0,k=0; proc<size; proc++) {
880     if (!len_s[proc]) continue;
881     /* form outgoing message for i-structure:
882          buf_si[0]:                 nrows to be sent
883               [1:nrows]:           row index (local to the destination process)
884                [nrows+1:2*nrows+1]: i-structure index
885     */
886     /*-------------------------------------------*/
887     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
888     buf_si_i    = buf_si + nrows+1;
889     buf_si[0]   = nrows;
890     buf_si_i[0] = 0;
891     nrows       = 0;
892     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
893       nzi = coi[i+1] - coi[i];
894       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
895       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
896       nrows++;
897     }
898     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
899     k++;
900     buf_si += len_si[proc];
901   }
902   for (i=0; i<nrecv; i++) {
903     ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
904   }
905   ierr = PetscFree(rwaits);CHKERRQ(ierr);
906   if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}
907 
908   ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr);
909   ierr = PetscFree(len_ri);CHKERRQ(ierr);
910   ierr = PetscFree(swaits);CHKERRQ(ierr);
911   ierr = PetscFree(buf_s);CHKERRQ(ierr);
912 
913   /* (5) compute the local portion of Cmpi      */
914   /* ------------------------------------------ */
915   /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
916   ierr          = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr);
917   current_space = free_space;
918 
919   ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr);
920   for (k=0; k<nrecv; k++) {
921     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
922     nrows       = *buf_ri_k[k];
923     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
924     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
925   }
926 
927   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
928   ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);
929   for (i=0; i<pn; i++) {
930     /* add C_loc into Cmpi */
931     nzi  = c_loc->i[i+1] - c_loc->i[i];
932     Jptr = c_loc->j + c_loc->i[i];
933     ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
934 
935     /* add received col data into lnk */
936     for (k=0; k<nrecv; k++) { /* k-th received message */
937       if (i == *nextrow[k]) { /* i-th row */
938         nzi  = *(nextci[k]+1) - *nextci[k];
939         Jptr = buf_rj[k] + *nextci[k];
940         ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
941         nextrow[k]++; nextci[k]++;
942       }
943     }
944     nzi = lnk[0];
945 
946     /* copy data into free space, then initialize lnk */
947     ierr = PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
948     ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr);
949   }
950   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
951   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
952   ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr);
953 
954   /* local sizes and preallocation */
955   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
956   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
957   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
958   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
959 
960   /* members in merge */
961   ierr = PetscFree(id_r);CHKERRQ(ierr);
962   ierr = PetscFree(len_r);CHKERRQ(ierr);
963   ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr);
964   ierr = PetscFree(buf_ri);CHKERRQ(ierr);
965   ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr);
966   ierr = PetscFree(buf_rj);CHKERRQ(ierr);
967   ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr);
968 
969   /* attach the supporting struct to Cmpi for reuse */
970   c = (Mat_MPIAIJ*)Cmpi->data;
971   c->ap           = ptap;
972   ptap->duplicate = Cmpi->ops->duplicate;
973   ptap->destroy   = Cmpi->ops->destroy;
974   ptap->view      = Cmpi->ops->view;
975   ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr);
976 
977   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
978   Cmpi->assembled        = PETSC_FALSE;
979   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
980   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
981   Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
982   *C                     = Cmpi;
983   PetscFunctionReturn(0);
984 }
985 
986 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
987 {
988   PetscErrorCode    ierr;
989   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
990   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
991   Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
992   Mat_APMPI         *ptap = c->ap;
993   Mat               AP_loc,C_loc,C_oth;
994   PetscInt          i,rstart,rend,cm,ncols,row;
995   PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
996   PetscScalar       *apa;
997   const PetscInt    *cols;
998   const PetscScalar *vals;
999 
1000   PetscFunctionBegin;
1001   if (!ptap) {
1002     MPI_Comm comm;
1003     ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1004     SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
1005   }
1006 
1007   ierr = MatZeroEntries(C);CHKERRQ(ierr);
1008   /* 1) get R = Pd^T,Ro = Po^T */
1009   if (ptap->reuse == MAT_REUSE_MATRIX) {
1010     ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
1011     ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
1012   }
1013 
1014   /* 2) get AP_loc */
1015   AP_loc = ptap->AP_loc;
1016   ap = (Mat_SeqAIJ*)AP_loc->data;
1017 
1018   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
1019   /*-----------------------------------------------------*/
1020   if (ptap->reuse == MAT_REUSE_MATRIX) {
1021     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
1022     ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
1023     ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
1024   }
1025 
1026   /* 2-2) compute numeric A_loc*P - dominating part */
1027   /* ---------------------------------------------- */
1028   /* get data from symbolic products */
1029   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1030   if (ptap->P_oth) {
1031     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1032   }
1033   apa   = ptap->apa;
1034   api   = ap->i;
1035   apj   = ap->j;
1036   for (i=0; i<am; i++) {
1037     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
1038     AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
1039     apnz = api[i+1] - api[i];
1040     for (j=0; j<apnz; j++) {
1041       col = apj[j+api[i]];
1042       ap->a[j+ap->i[i]] = apa[col];
1043       apa[col] = 0.0;
1044     }
1045     ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr);
1046   }
1047 
1048   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
1049   ierr = ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr);
1050   ierr = ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr);
1051   C_loc = ptap->C_loc;
1052   C_oth = ptap->C_oth;
1053 
1054   /* add C_loc and Co to C */
1055   ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);
1056 
1057   /* C_loc -> C */
1058   cm    = C_loc->rmap->N;
1059   c_seq = (Mat_SeqAIJ*)C_loc->data;
1060   cols = c_seq->j;
1061   vals = c_seq->a;
1062 
1063 
1064   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
1065   /* when there are no off-processor parts. */
1066   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
1067   /* no longer holds. The more general function MatSetValues_MPIAIJ() must then be used, where the column index is looked up from */
1068   /* a table and additional bookkeeping is performed. */
1069   if (C->assembled) {
1070     C->was_assembled = PETSC_TRUE;
1071     C->assembled     = PETSC_FALSE;
1072   }
1073   if (C->was_assembled) {
1074     for (i=0; i<cm; i++) {
1075       ncols = c_seq->i[i+1] - c_seq->i[i];
1076       row = rstart + i;
1077       ierr = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
1078       cols += ncols; vals += ncols;
1079     }
1080   } else {
1081     ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr);
1082   }
1083 
1084   /* Co -> C, off-processor part */
1085   cm = C_oth->rmap->N;
1086   c_seq = (Mat_SeqAIJ*)C_oth->data;
1087   cols = c_seq->j;
1088   vals = c_seq->a;
1089   for (i=0; i<cm; i++) {
1090     ncols = c_seq->i[i+1] - c_seq->i[i];
1091     row = p->garray[i];
1092     ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
1093     cols += ncols; vals += ncols;
1094   }
1095 
1096   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1097   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1098 
1099   ptap->reuse = MAT_REUSE_MATRIX;
1100 
1101   /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated with new values of A and P */
1102   if (ptap->freestruct) {
1103     ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
1104   }
1105   PetscFunctionReturn(0);
1106 }
1107