xref: /petsc/src/mat/impls/aij/mpi/mpiptap.c (revision 8fb5bd83c3955fefcf33a54e3bb66920a9fa884b)
1 
2 /*
3   Defines projective product routines where A is a MPIAIJ matrix
4           C = P^T * A * P
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h>   /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <../src/mat/impls/aij/mpi/mpiaij.h>
10 #include <petscbt.h>
11 #include <petsctime.h>
12 #include <petsc/private/hashmapiv.h>
13 #include <petsc/private/hashseti.h>
14 #include <petscsf.h>
15 
16 PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
17 {
18   PetscBool         iascii;
19   PetscViewerFormat format;
20   Mat_APMPI         *ptap;
21 
22   PetscFunctionBegin;
23   MatCheckProduct(A,1);
24   ptap = (Mat_APMPI*)A->product->data;
25   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii));
26   if (iascii) {
27     PetscCall(PetscViewerGetFormat(viewer,&format));
28     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
29       if (ptap->algType == 0) {
30         PetscCall(PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n"));
31       } else if (ptap->algType == 1) {
32         PetscCall(PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n"));
33       } else if (ptap->algType == 2) {
34         PetscCall(PetscViewerASCIIPrintf(viewer,"using allatonce MatPtAP() implementation\n"));
35       } else if (ptap->algType == 3) {
36         PetscCall(PetscViewerASCIIPrintf(viewer,"using merged allatonce MatPtAP() implementation\n"));
37       }
38     }
39   }
40   PetscFunctionReturn(0);
41 }
42 
43 PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
44 {
45   Mat_APMPI           *ptap = (Mat_APMPI*)data;
46   Mat_Merge_SeqsToMPI *merge;
47 
48   PetscFunctionBegin;
49   PetscCall(PetscFree2(ptap->startsj_s,ptap->startsj_r));
50   PetscCall(PetscFree(ptap->bufa));
51   PetscCall(MatDestroy(&ptap->P_loc));
52   PetscCall(MatDestroy(&ptap->P_oth));
53   PetscCall(MatDestroy(&ptap->A_loc)); /* used by MatTransposeMatMult() */
54   PetscCall(MatDestroy(&ptap->Rd));
55   PetscCall(MatDestroy(&ptap->Ro));
56   if (ptap->AP_loc) { /* used by alg_rap */
57     Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
58     PetscCall(PetscFree(ap->i));
59     PetscCall(PetscFree2(ap->j,ap->a));
60     PetscCall(MatDestroy(&ptap->AP_loc));
61   } else { /* used by alg_ptap */
62     PetscCall(PetscFree(ptap->api));
63     PetscCall(PetscFree(ptap->apj));
64   }
65   PetscCall(MatDestroy(&ptap->C_loc));
66   PetscCall(MatDestroy(&ptap->C_oth));
67   if (ptap->apa) PetscCall(PetscFree(ptap->apa));
68 
69   PetscCall(MatDestroy(&ptap->Pt));
70 
71   merge = ptap->merge;
72   if (merge) { /* used by alg_ptap */
73     PetscCall(PetscFree(merge->id_r));
74     PetscCall(PetscFree(merge->len_s));
75     PetscCall(PetscFree(merge->len_r));
76     PetscCall(PetscFree(merge->bi));
77     PetscCall(PetscFree(merge->bj));
78     PetscCall(PetscFree(merge->buf_ri[0]));
79     PetscCall(PetscFree(merge->buf_ri));
80     PetscCall(PetscFree(merge->buf_rj[0]));
81     PetscCall(PetscFree(merge->buf_rj));
82     PetscCall(PetscFree(merge->coi));
83     PetscCall(PetscFree(merge->coj));
84     PetscCall(PetscFree(merge->owners_co));
85     PetscCall(PetscLayoutDestroy(&merge->rowmap));
86     PetscCall(PetscFree(ptap->merge));
87   }
88   PetscCall(ISLocalToGlobalMappingDestroy(&ptap->ltog));
89 
90   PetscCall(PetscSFDestroy(&ptap->sf));
91   PetscCall(PetscFree(ptap->c_othi));
92   PetscCall(PetscFree(ptap->c_rmti));
93   PetscCall(PetscFree(ptap));
94   PetscFunctionReturn(0);
95 }
96 
97 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
98 {
99   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
100   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
101   Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
102   Mat_APMPI         *ptap;
103   Mat               AP_loc,C_loc,C_oth;
104   PetscInt          i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz,nout;
105   PetscScalar       *apa;
106   const PetscInt    *cols;
107   const PetscScalar *vals;
108 
109   PetscFunctionBegin;
110   MatCheckProduct(C,3);
111   ptap = (Mat_APMPI*)C->product->data;
112   PetscCheck(ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
113   PetscCheck(ptap->AP_loc,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");
114 
115   PetscCall(MatZeroEntries(C));
116 
117   /* 1) get R = Pd^T,Ro = Po^T */
118   if (ptap->reuse == MAT_REUSE_MATRIX) {
119     PetscCall(MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd));
120     PetscCall(MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro));
121   }
122 
123   /* 2) get AP_loc */
124   AP_loc = ptap->AP_loc;
125   ap = (Mat_SeqAIJ*)AP_loc->data;
126 
127   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
128   /*-----------------------------------------------------*/
129   if (ptap->reuse == MAT_REUSE_MATRIX) {
130     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
131     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth));
132     PetscCall(MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc));
133   }
134 
135   /* 2-2) compute numeric A_loc*P - dominating part */
136   /* ---------------------------------------------- */
137   /* get data from symbolic products */
138   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
139   if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
140 
141   api   = ap->i;
142   apj   = ap->j;
143   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog,api[AP_loc->rmap->n],apj,apj));
144   for (i=0; i<am; i++) {
145     /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
146     apnz = api[i+1] - api[i];
147     apa = ap->a + api[i];
148     PetscCall(PetscArrayzero(apa,apnz));
149     AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
150   }
151   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,api[AP_loc->rmap->n],apj,&nout,apj));
152   PetscCheck(api[AP_loc->rmap->n] == nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,api[AP_loc->rmap->n],nout);
153 
154   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
155   /* Always use scalable version since we are in the MPI scalable version */
156   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc));
157   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth));
158 
159   C_loc = ptap->C_loc;
160   C_oth = ptap->C_oth;
161 
162   /* add C_loc and Co to to C */
163   PetscCall(MatGetOwnershipRange(C,&rstart,&rend));
164 
165   /* C_loc -> C */
166   cm    = C_loc->rmap->N;
167   c_seq = (Mat_SeqAIJ*)C_loc->data;
168   cols = c_seq->j;
169   vals = c_seq->a;
170   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_loc->rmap->n],c_seq->j,c_seq->j));
171 
172   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
173   /* when there are no off-processor parts.  */
174   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
175   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
176   /* a table, and other, more complex stuff has to be done. */
177   if (C->assembled) {
178     C->was_assembled = PETSC_TRUE;
179     C->assembled     = PETSC_FALSE;
180   }
181   if (C->was_assembled) {
182     for (i=0; i<cm; i++) {
183       ncols = c_seq->i[i+1] - c_seq->i[i];
184       row = rstart + i;
185       PetscCall(MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES));
186       cols += ncols; vals += ncols;
187     }
188   } else {
189     PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a));
190   }
191   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_loc->rmap->n],c_seq->j,&nout,c_seq->j));
192   PetscCheck(c_seq->i[C_loc->rmap->n] == nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_seq->i[C_loc->rmap->n],nout);
193 
194   /* Co -> C, off-processor part */
195   cm = C_oth->rmap->N;
196   c_seq = (Mat_SeqAIJ*)C_oth->data;
197   cols = c_seq->j;
198   vals = c_seq->a;
199   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_oth->rmap->n],c_seq->j,c_seq->j));
200   for (i=0; i<cm; i++) {
201     ncols = c_seq->i[i+1] - c_seq->i[i];
202     row = p->garray[i];
203     PetscCall(MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES));
204     cols += ncols; vals += ncols;
205   }
206   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
207   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
208 
209   ptap->reuse = MAT_REUSE_MATRIX;
210 
211   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_oth->rmap->n],c_seq->j,&nout,c_seq->j));
212   PetscCheck(c_seq->i[C_oth->rmap->n] == nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_seq->i[C_loc->rmap->n],nout);
213   PetscFunctionReturn(0);
214 }
215 
/* Symbolic phase of the scalable C = P^T*A*P for MPIAIJ matrices.

   Builds the sparsity pattern of Cmpi and caches all intermediate objects
   (P_loc, P_oth, Rd = Pd^T, Ro = Po^T, AP_loc, C_loc, C_oth, ltog) in a
   Mat_APMPI struct attached to Cmpi->product->data for reuse by the numeric
   phase.  `fill` is the user-supplied fill-ratio estimate used to size the
   initial free-space buffers.  Collective on the communicator of A. */
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat Cmpi)
{
  Mat_APMPI           *ptap;
  Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  MPI_Comm            comm;
  PetscMPIInt         size,rank;
  Mat                 P_loc,P_oth;
  PetscFreeSpaceList  free_space=NULL,current_space=NULL;
  PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
  PetscInt            *lnk,i,k,pnz,row,nsend;
  PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,nrecv;
  PETSC_UNUSED PetscMPIInt icompleted=0;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  const PetscInt      *owners;
  PetscInt            len,proc,*dnz,*onz,nzi,nspacedouble;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  PetscLayout         rowmap;
  PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
  PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
  PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi,nout;
  Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
  PetscScalar         *apv;
  PetscTable          ta;
  MatType             mtype;
  const char          *prefix;
#if defined(PETSC_USE_INFO)
  PetscReal           apfill;
#endif

  PetscFunctionBegin;
  MatCheckProduct(Cmpi,4);
  PetscCheck(!Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
  PetscCallMPI(MPI_Comm_size(comm,&size));
  PetscCallMPI(MPI_Comm_rank(comm,&rank));

  /* off-diagonal part a->B only exists in parallel */
  if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

  /* create symbolic parallel matrix Cmpi */
  PetscCall(MatGetType(A,&mtype));
  PetscCall(MatSetType(Cmpi,mtype));

  /* create struct Mat_APMPI and attach it to C later */
  PetscCall(PetscNew(&ptap));
  ptap->reuse = MAT_INITIAL_MATRIX;
  ptap->algType = 0; /* 0 == scalable algorithm (see MatView_MPIAIJ_PtAP) */

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth));
  /* get P_loc by taking all local rows of P */
  PetscCall(MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc));

  ptap->P_loc = P_loc;
  ptap->P_oth = P_oth;

  /* (0) compute Rd = Pd^T, Ro = Po^T  */
  /* --------------------------------- */
  PetscCall(MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd));
  PetscCall(MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro));

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  /* ----------------------------------------------------------------- */
  p_loc  = (Mat_SeqAIJ*)P_loc->data;
  if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;

  /* create and initialize a linked list */
  PetscCall(PetscTableCreate(pn,pN,&ta)); /* for compute AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
  /* NOTE(review): P_oth is dereferenced unconditionally here while p_oth above is
     guarded — presumably MatGetBrowsOfAoCols_MPIAIJ always returns a matrix; confirm */
  MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
  PetscCall(PetscTableGetCount(ta,&Crmax)); /* Crmax = nnz(sum of Prows) */

  PetscCall(PetscLLCondensedCreate_Scalable(Crmax,&lnk));

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space));
  } else {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space));
  }
  current_space = free_space;
  nspacedouble  = 0;

  PetscCall(PetscMalloc1(am+1,&api));
  api[0] = 0;
  for (i=0; i<am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai = ad->i; pi = p_loc->i;
    nzi = ai[i+1] - ai[i];
    aj  = ad->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi[row+1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      PetscCall(PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk));
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai = ao->i; pi = p_oth->i;
      nzi = ai[i+1] - ai[i];
      aj  = ao->j + ai[i];
      for (j=0; j<nzi; j++) {
        row  = aj[j];
        pnz  = pi[row+1] - pi[row];
        Jptr = p_oth->j + pi[row];
        PetscCall(PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk));
      }
    }
    apnz     = lnk[0]; /* lnk[0] holds the merged row length */
    api[i+1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space));
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk));

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  PetscCall(PetscCalloc2(api[am],&apj,api[am],&apv));
  PetscCall(PetscFreeSpaceContiguous(&free_space,apj));
  PetscCall(PetscLLCondensedDestroy_Scalable(lnk));

  /* Create AP_loc for reuse; column indices are then compacted to a local
     numbering, with the local->global map saved in ptap->ltog */
  PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc));
  PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog));

#if defined(PETSC_USE_INFO)
  if (ao) {
    apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
  } else {
    apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    PetscCall(PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill));
    PetscCall(PetscInfo(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill));
  } else {
    PetscCall(PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n"));
  }
#endif

  /* (2-1) compute symbolic Co = Ro*AP_loc  */
  /* -------------------------------------- */
  PetscCall(MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth));
  PetscCall(MatGetOptionsPrefix(A,&prefix));
  PetscCall(MatSetOptionsPrefix(ptap->C_oth,prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->C_oth,"inner_offdiag_"));

  PetscCall(MatProductSetType(ptap->C_oth,MATPRODUCT_AB));
  PetscCall(MatProductSetAlgorithm(ptap->C_oth,"sorted"));
  PetscCall(MatProductSetFill(ptap->C_oth,fill));
  PetscCall(MatProductSetFromOptions(ptap->C_oth));
  PetscCall(MatProductSymbolic(ptap->C_oth));

  /* (3) send coj of C_oth to other processors  */
  /* ------------------------------------------ */
  /* determine row ownership */
  PetscCall(PetscLayoutCreate(comm,&rowmap));
  PetscCall(PetscLayoutSetLocalSize(rowmap, pn));
  PetscCall(PetscLayoutSetBlockSize(rowmap, 1));
  PetscCall(PetscLayoutSetUp(rowmap));
  PetscCall(PetscLayoutGetRanges(rowmap,&owners));

  /* determine the number of messages to send, their lengths */
  PetscCall(PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co));
  PetscCall(PetscArrayzero(len_s,size));
  PetscCall(PetscArrayzero(len_si,size));

  c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
  coi   = c_oth->i; coj = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  /* coj is mapped to global indices in place for the sends; it is mapped back
     to local indices at the end of this routine */
  PetscCall(ISLocalToGlobalMappingApply(ptap->ltog,coi[con],coj,coj));
  for (i=0; i<con; i++) {
    /* rows of C_oth correspond to global rows prmap[i]; walk owners in order
       (prmap is sorted, so proc only advances) */
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj  */
  PetscCall(PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv));
  PetscCall(PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri));

  /* post the Irecv and Isend of coj */
  PetscCall(PetscCommGetNewTag(comm,&tagj));
  PetscCall(PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits));
  PetscCall(PetscMalloc1(nsend+1,&swaits));
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    PetscCallMPI(MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k));
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc, overlapped with the coj
     communication posted above */
  /* ---------------------------------------- */
  PetscCall(MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc));
  PetscCall(MatProductSetType(ptap->C_loc,MATPRODUCT_AB));
  PetscCall(MatProductSetAlgorithm(ptap->C_loc,"default"));
  PetscCall(MatProductSetFill(ptap->C_loc,fill));

  PetscCall(MatSetOptionsPrefix(ptap->C_loc,prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->C_loc,"inner_diag_"));

  PetscCall(MatProductSetFromOptions(ptap->C_loc));
  PetscCall(MatProductSymbolic(ptap->C_loc));

  c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
  PetscCall(ISLocalToGlobalMappingApply(ptap->ltog,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,c_loc->j));

  /* receives coj are complete */
  for (i=0; i<nrecv; i++) {
    PetscCallMPI(MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus));
  }
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend,swaits,sstatus));

  /* add received column indices into ta to update Crmax */
  for (k=0; k<nrecv; k++) {/* k-th received message */
    Jptr = buf_rj[k];
    for (j=0; j<len_r[k]; j++) {
      PetscCall(PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES));
    }
  }
  PetscCall(PetscTableGetCount(ta,&Crmax));
  PetscCall(PetscTableDestroy(&ta));

  /* (4) send and recv coi */
  /*-----------------------*/
  PetscCall(PetscCommGetNewTag(comm,&tagi));
  PetscCall(PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits));
  PetscCall(PetscMalloc1(len+1,&buf_s));
  buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
      buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
      nrows++;
    }
    PetscCallMPI(MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k));
    k++;
    buf_si += len_si[proc];
  }
  for (i=0; i<nrecv; i++) {
    PetscCallMPI(MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus));
  }
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend,swaits,sstatus));

  PetscCall(PetscFree4(len_s,len_si,sstatus,owners_co));
  PetscCall(PetscFree(len_ri));
  PetscCall(PetscFree(swaits));
  PetscCall(PetscFree(buf_s));

  /* (5) compute the local portion of Cmpi      */
  /* ------------------------------------------ */
  /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
  PetscCall(PetscFreeSpaceGet(Crmax,&free_space));
  current_space = free_space;

  PetscCall(PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci));
  for (k=0; k<nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
  }

  MatPreallocateBegin(comm,pn,pn,dnz,onz);
  PetscCall(PetscLLCondensedCreate_Scalable(Crmax,&lnk));
  for (i=0; i<pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i+1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    PetscCall(PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk));

    /* add received col data into lnk */
    for (k=0; k<nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        PetscCall(PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk));
        nextrow[k]++; nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk));
    PetscCall(MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz));
  }
  PetscCall(PetscFree3(buf_ri_k,nextrow,nextci));
  PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
  PetscCall(PetscFreeSpaceDestroy(free_space));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));
  if (P->cmap->bs > 0) {
    PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs));
    PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs));
  }
  PetscCall(MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz));
  MatPreallocateEnd(dnz,onz);

  /* members in merge */
  PetscCall(PetscFree(id_r));
  PetscCall(PetscFree(len_r));
  PetscCall(PetscFree(buf_ri[0]));
  PetscCall(PetscFree(buf_ri));
  PetscCall(PetscFree(buf_rj[0]));
  PetscCall(PetscFree(buf_rj));
  PetscCall(PetscLayoutDestroy(&rowmap));

  /* map the column indices of C_oth and C_loc back to the compacted local
     numbering expected by the numeric phase */
  nout = 0;
  PetscCall(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_oth->i[ptap->C_oth->rmap->n],c_oth->j,&nout,c_oth->j));
  PetscCheck(c_oth->i[ptap->C_oth->rmap->n] == nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_oth->i[ptap->C_oth->rmap->n],nout);
  PetscCall(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,&nout,c_loc->j));
  PetscCheck(c_loc->i[ptap->C_loc->rmap->n] == nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_loc->i[ptap->C_loc->rmap->n],nout);

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled        = PETSC_FALSE;
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
  PetscFunctionReturn(0);
}
583 
/* Symbolic helper for the allatonce PtAP: compute the nonzero column pattern of
   row i of AP = A*P, splitting the columns into two hash sets: dht (columns in
   the local diagonal block [pcstart,pcend)) and oht (off-diagonal columns).

   `map` translates off-diagonal column indices of A into rows of P_oth;
   `dof` interleaves multiple degrees of freedom per P row/column, with
   `offset = row%dof` selecting which interleaved component a column belongs to.
   Column indices are scaled by dof accordingly. */
static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHSetI dht,PetscHSetI oht)
{
  Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  PetscInt             *ai,nzi,j,*aj,row,col,*pi,*pj,pnz,nzpi,*p_othcols,k;
  PetscInt             pcstart,pcend,column,offset;

  PetscFunctionBegin;
  /* ownership range of P's columns, scaled to dof-interleaved indexing */
  pcstart = P->cmap->rstart;
  pcstart *= dof;
  pcend   = P->cmap->rend;
  pcend   *= dof;
  /* diagonal portion: Ad[i,:]*P -- columns of Pd are local, hence go to dht */
  ai = ad->i;
  nzi = ai[i+1] - ai[i];
  aj  = ad->j + ai[i];
  for (j=0; j<nzi; j++) {
    row  = aj[j];
    offset = row%dof;  /* which dof component this A-column addresses */
    row   /= dof;      /* corresponding (un-interleaved) row of P */
    nzpi = pd->i[row+1] - pd->i[row];
    pj  = pd->j + pd->i[row];
    for (k=0; k<nzpi; k++) {
      PetscCall(PetscHSetIAdd(dht,pj[k]*dof+offset+pcstart));
    }
  }
  /* off diag P: same Ad rows against Po -- garray gives global columns, go to oht */
  for (j=0; j<nzi; j++) {
    row  = aj[j];
    offset = row%dof;
    row   /= dof;
    nzpi = po->i[row+1] - po->i[row];
    pj  = po->j + po->i[row];
    for (k=0; k<nzpi; k++) {
      PetscCall(PetscHSetIAdd(oht,p->garray[pj[k]]*dof+offset));
    }
  }

  /* off diagonal part: Ao[i, :]*P_oth -- P_oth columns are global; classify each
     into dht or oht depending on whether it falls in [pcstart,pcend) */
  if (ao) {
    ai = ao->i;
    pi = p_oth->i;
    nzi = ai[i+1] - ai[i];
    aj  = ao->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      offset = a->garray[row]%dof; /* dof component from the global A column */
      row  = map[row];             /* map A off-diag column to a P_oth row */
      pnz  = pi[row+1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      for (col=0; col<pnz; col++) {
        column = p_othcols[col] * dof + offset;
        if (column>=pcstart && column<pcend) {
          PetscCall(PetscHSetIAdd(dht,column));
        } else {
          PetscCall(PetscHSetIAdd(oht,column));
        }
      }
    }
  } /* end if (ao) */
  PetscFunctionReturn(0);
}
646 
/* Numeric helper for the allatonce PtAP: accumulate the values of row i of
   AP = A*P into hmap, keyed by (dof-interleaved) global column index.
   Mirrors the column classification of the symbolic helper above, but here all
   contributions go into a single hash map of column -> value.
   Flops are logged per merged P row (2 flops per multiply-add). */
static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHMapIV hmap)
{
  Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  PetscInt             *ai,nzi,j,*aj,row,col,*pi,pnz,*p_othcols,pcstart,*pj,k,nzpi,offset;
  PetscScalar          ra,*aa,*pa;

  PetscFunctionBegin;
  /* start of this rank's (dof-interleaved) column ownership range of P */
  pcstart = P->cmap->rstart;
  pcstart *= dof;

  /* diagonal portion: Ad[i,:]*P (against Pd; columns shifted by pcstart to global) */
  ai  = ad->i;
  nzi = ai[i+1] - ai[i];
  aj  = ad->j + ai[i];
  aa  = ad->a + ai[i];
  for (j=0; j<nzi; j++) {
    ra   = aa[j];       /* A(i,aj[j]) */
    row  = aj[j];
    offset = row%dof;   /* dof component addressed by this A column */
    row    /= dof;      /* corresponding row of P */
    nzpi = pd->i[row+1] - pd->i[row];
    pj = pd->j + pd->i[row];
    pa = pd->a + pd->i[row];
    for (k=0; k<nzpi; k++) {
      PetscCall(PetscHMapIVAddValue(hmap,pj[k]*dof+offset+pcstart,ra*pa[k]));
    }
    PetscCall(PetscLogFlops(2.0*nzpi));
  }
  /* same Ad rows against Po; garray maps Po columns to global indices */
  for (j=0; j<nzi; j++) {
    ra   = aa[j];
    row  = aj[j];
    offset = row%dof;
    row   /= dof;
    nzpi = po->i[row+1] - po->i[row];
    pj = po->j + po->i[row];
    pa = po->a + po->i[row];
    for (k=0; k<nzpi; k++) {
      PetscCall(PetscHMapIVAddValue(hmap,p->garray[pj[k]]*dof+offset,ra*pa[k]));
    }
    PetscCall(PetscLogFlops(2.0*nzpi));
  }

  /* off diagonal part: Ao[i, :]*P_oth (P_oth columns are already global) */
  if (ao) {
    ai = ao->i;
    pi = p_oth->i;
    nzi = ai[i+1] - ai[i];
    aj  = ao->j + ai[i];
    aa  = ao->a + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      offset = a->garray[row]%dof; /* dof component from the global A column */
      row    = map[row];           /* map A off-diag column to a P_oth row */
      ra   = aa[j];
      pnz  = pi[row+1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      pa   = p_oth->a + pi[row];
      for (col=0; col<pnz; col++) {
        PetscCall(PetscHMapIVAddValue(hmap,p_othcols[col]*dof+offset,ra*pa[col]));
      }
      PetscCall(PetscLogFlops(2.0*pnz));
    }
  } /* end if (ao) */

  PetscFunctionReturn(0);
}
714 
715 PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat,Mat,PetscInt dof,MatReuse,Mat*);
716 
717 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,Mat C)
718 {
719   Mat_MPIAIJ        *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
720   Mat_SeqAIJ        *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
721   Mat_APMPI         *ptap;
722   PetscHMapIV       hmap;
723   PetscInt          i,j,jj,kk,nzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,ccstart,ccend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,*dcc,*occ,loc;
724   PetscScalar       *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
725   PetscInt          offset,ii,pocol;
726   const PetscInt    *mappingindices;
727   IS                map;
728 
729   PetscFunctionBegin;
730   MatCheckProduct(C,4);
731   ptap = (Mat_APMPI*)C->product->data;
732   PetscCheck(ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
733   PetscCheck(ptap->P_oth,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");
734 
735   PetscCall(MatZeroEntries(C));
736 
737   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
738   /*-----------------------------------------------------*/
739   if (ptap->reuse == MAT_REUSE_MATRIX) {
740     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
741     PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth));
742   }
743   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map));
744 
745   PetscCall(MatGetLocalSize(p->B,NULL,&pon));
746   pon *= dof;
747   PetscCall(PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta));
748   PetscCall(MatGetLocalSize(A,&am,NULL));
749   cmaxr = 0;
750   for (i=0; i<pon; i++) {
751     cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
752   }
753   PetscCall(PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc));
754   PetscCall(PetscHMapIVCreate(&hmap));
755   PetscCall(PetscHMapIVResize(hmap,cmaxr));
756   PetscCall(ISGetIndices(map,&mappingindices));
757   for (i=0; i<am && pon; i++) {
758     PetscCall(PetscHMapIVClear(hmap));
759     offset = i%dof;
760     ii     = i/dof;
761     nzi = po->i[ii+1] - po->i[ii];
762     if (!nzi) continue;
763     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap));
764     voff = 0;
765     PetscCall(PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues));
766     if (!voff) continue;
767 
768     /* Form C(ii, :) */
769     poj = po->j + po->i[ii];
770     poa = po->a + po->i[ii];
771     for (j=0; j<nzi; j++) {
772       pocol = poj[j]*dof+offset;
773       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
774       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
775       for (jj=0; jj<voff; jj++) {
776         apvaluestmp[jj] = apvalues[jj]*poa[j];
777         /*If the row is empty */
778         if (!c_rmtc[pocol]) {
779           c_rmtjj[jj] = apindices[jj];
780           c_rmtaa[jj] = apvaluestmp[jj];
781           c_rmtc[pocol]++;
782         } else {
783           PetscCall(PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc));
784           if (loc>=0){ /* hit */
785             c_rmtaa[loc] += apvaluestmp[jj];
786             PetscCall(PetscLogFlops(1.0));
787           } else { /* new element */
788             loc = -(loc+1);
789             /* Move data backward */
790             for (kk=c_rmtc[pocol]; kk>loc; kk--) {
791               c_rmtjj[kk] = c_rmtjj[kk-1];
792               c_rmtaa[kk] = c_rmtaa[kk-1];
793             }/* End kk */
794             c_rmtjj[loc] = apindices[jj];
795             c_rmtaa[loc] = apvaluestmp[jj];
796             c_rmtc[pocol]++;
797           }
798         }
799         PetscCall(PetscLogFlops(voff));
800       } /* End jj */
801     } /* End j */
802   } /* End i */
803 
804   PetscCall(PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc));
805   PetscCall(PetscHMapIVDestroy(&hmap));
806 
807   PetscCall(MatGetLocalSize(P,NULL,&pn));
808   pn *= dof;
809   PetscCall(PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha));
810 
811   PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
812   PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE));
813   PetscCall(MatGetOwnershipRangeColumn(P,&pcstart,&pcend));
814   pcstart = pcstart*dof;
815   pcend   = pcend*dof;
816   cd = (Mat_SeqAIJ*)(c->A)->data;
817   co = (Mat_SeqAIJ*)(c->B)->data;
818 
819   cmaxr = 0;
820   for (i=0; i<pn; i++) {
821     cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
822   }
823   PetscCall(PetscCalloc5(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pn,&dcc,pn,&occ));
824   PetscCall(PetscHMapIVCreate(&hmap));
825   PetscCall(PetscHMapIVResize(hmap,cmaxr));
826   for (i=0; i<am && pn; i++) {
827     PetscCall(PetscHMapIVClear(hmap));
828     offset = i%dof;
829     ii     = i/dof;
830     nzi = pd->i[ii+1] - pd->i[ii];
831     if (!nzi) continue;
832     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap));
833     voff = 0;
834     PetscCall(PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues));
835     if (!voff) continue;
836     /* Form C(ii, :) */
837     pdj = pd->j + pd->i[ii];
838     pda = pd->a + pd->i[ii];
839     for (j=0; j<nzi; j++) {
840       row = pcstart + pdj[j] * dof + offset;
841       for (jj=0; jj<voff; jj++) {
842         apvaluestmp[jj] = apvalues[jj]*pda[j];
843       }
844       PetscCall(PetscLogFlops(voff));
845       PetscCall(MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES));
846     }
847   }
848   PetscCall(ISRestoreIndices(map,&mappingindices));
849   PetscCall(MatGetOwnershipRangeColumn(C,&ccstart,&ccend));
850   PetscCall(PetscFree5(apindices,apvalues,apvaluestmp,dcc,occ));
851   PetscCall(PetscHMapIVDestroy(&hmap));
852   PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
853   PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE));
854   PetscCall(PetscFree2(c_rmtj,c_rmta));
855 
856   /* Add contributions from remote */
857   for (i = 0; i < pn; i++) {
858     row = i + pcstart;
859     PetscCall(MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES));
860   }
861   PetscCall(PetscFree2(c_othj,c_otha));
862 
863   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
864   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
865 
866   ptap->reuse = MAT_REUSE_MATRIX;
867   PetscFunctionReturn(0);
868 }
869 
/* dof = 1 specialization of the all-at-once PtAP numeric phase for plain MPIAIJ matrices */
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,Mat C)
{
  PetscFunctionBegin;

  PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A,P,1,C));
  PetscFunctionReturn(0);
}
877 
/*
  MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged - numeric phase of C = P^T*A*P
  (algType 3).  Same algorithm as MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(), but
  for each row of A the remote (off-diagonal P) and local (diagonal P)
  contributions are formed in a single merged sweep from one shared AP-row hash
  map, instead of two separate passes over the rows of A.

  Collective.

  Input Parameters:
+ A   - MPIAIJ matrix
. P   - MPIAIJ projection matrix; dof consecutive rows/columns per node when dof > 1
- dof - number of degrees of freedom per node

  Output Parameter:
. C   - the product matrix, preallocated by MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged()
*/
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,Mat C)
{
  Mat_MPIAIJ        *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ        *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
  Mat_APMPI         *ptap;
  PetscHMapIV       hmap;
  PetscInt          i,j,jj,kk,nzi,dnzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,loc;
  PetscScalar       *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
  PetscInt          offset,ii,pocol;
  const PetscInt    *mappingindices;
  IS                map;

  PetscFunctionBegin;
  MatCheckProduct(C,4);
  ptap = (Mat_APMPI*)C->product->data;
  PetscCheck(ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
  PetscCheck(ptap->P_oth,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));

  /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth));
  }
  /* Mapping from off-diagonal columns of A to rows of P_oth, attached in the symbolic phase */
  PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map));
  /* pon = off-diagonal columns of P (remote C rows we contribute to), pn = locally owned rows of C */
  PetscCall(MatGetLocalSize(p->B,NULL,&pon));
  pon *= dof;
  PetscCall(MatGetLocalSize(P,NULL,&pn));
  pn  *= dof;

  PetscCall(PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta));
  PetscCall(MatGetLocalSize(A,&am,NULL));
  PetscCall(MatGetOwnershipRangeColumn(P,&pcstart,&pcend));
  pcstart *= dof;
  pcend   *= dof;
  /* NOTE(review): pcend is scaled but not read afterwards in this function */
  /* cmaxr = maximum row length over both the buffered remote rows and the local rows of C */
  cmaxr = 0;
  for (i=0; i<pon; i++) {
    cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
  }
  cd = (Mat_SeqAIJ*)(c->A)->data;
  co = (Mat_SeqAIJ*)(c->B)->data;
  for (i=0; i<pn; i++) {
    cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
  }
  PetscCall(PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc));
  PetscCall(PetscHMapIVCreate(&hmap));
  PetscCall(PetscHMapIVResize(hmap,cmaxr));
  PetscCall(ISGetIndices(map,&mappingindices));
  /* Single merged sweep over the local rows of A */
  for (i=0; i<am && (pon || pn); i++) {
    PetscCall(PetscHMapIVClear(hmap));
    offset = i%dof;
    ii     = i/dof;
    nzi  = po->i[ii+1] - po->i[ii];
    dnzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi && !dnzi) continue;
    /* Accumulate row i of A*P into hmap */
    PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap));
    voff = 0;
    PetscCall(PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues));
    if (!voff) continue;

    /* Form remote C(ii, :) */
    poj = po->j + po->i[ii];
    poa = po->a + po->i[ii];
    for (j=0; j<nzi; j++) {
      pocol = poj[j]*dof+offset;
      c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
      c_rmtaa = c_rmta + ptap->c_rmti[pocol];
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*poa[j];
        /*If the row is empty */
        if (!c_rmtc[pocol]) {
          c_rmtjj[jj] = apindices[jj];
          c_rmtaa[jj] = apvaluestmp[jj];
          c_rmtc[pocol]++;
        } else {
          /* Binary search in the sorted buffered row */
          PetscCall(PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc));
          if (loc>=0){ /* hit */
            c_rmtaa[loc] += apvaluestmp[jj];
            PetscCall(PetscLogFlops(1.0));
          } else { /* new element */
            loc = -(loc+1);
            /* Move data backward */
            for (kk=c_rmtc[pocol]; kk>loc; kk--) {
              c_rmtjj[kk] = c_rmtjj[kk-1];
              c_rmtaa[kk] = c_rmtaa[kk-1];
            }/* End kk */
            c_rmtjj[loc] = apindices[jj];
            c_rmtaa[loc] = apvaluestmp[jj];
            c_rmtc[pocol]++;
          }
        }
      } /* End jj */
      PetscCall(PetscLogFlops(voff));
    } /* End j */

    /* Form local C(ii, :) */
    pdj = pd->j + pd->i[ii];
    pda = pd->a + pd->i[ii];
    for (j=0; j<dnzi; j++) {
      row = pcstart + pdj[j] * dof + offset;
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*pda[j];
      }/* End kk */
      PetscCall(PetscLogFlops(voff));
      PetscCall(MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES));
    }/* End j */
  } /* End i */

  PetscCall(ISRestoreIndices(map,&mappingindices));
  PetscCall(PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc));
  PetscCall(PetscHMapIVDestroy(&hmap));
  PetscCall(PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha));

  /* Ship the buffered remote contributions to their owning ranks */
  PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
  PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE));
  PetscCall(PetscFree2(c_rmtj,c_rmta));

  /* Add contributions from remote */
  for (i = 0; i < pn; i++) {
    row = i + pcstart;
    PetscCall(MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES));
  }
  PetscCall(PetscFree2(c_othj,c_otha));

  PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));

  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(0);
}
1012 
/* dof = 1 specialization of the merged all-at-once PtAP numeric phase for plain MPIAIJ matrices */
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,Mat C)
{
  PetscFunctionBegin;

  PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,C));
  PetscFunctionReturn(0);
}
1020 
/* TODO: move algorithm selection to MatProductSetFromOptions */
/*
  MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce - symbolic phase of the all-at-once
  C = P^T*A*P (algType 2).  Determines the sparsity pattern of C, preallocates
  Cmpi, builds the PetscSF (ptap->sf) and the offset arrays (ptap->c_rmti,
  ptap->c_othi) used by the numeric phase to exchange contributions to
  remotely-owned rows of C, and selects the numeric kernel.

  Collective.

  Input Parameters:
+ A    - MPIAIJ matrix
. P    - MPIAIJ projection matrix; dof consecutive rows/columns per node when dof > 1
. dof  - number of degrees of freedom per node
- fill - expected fill ratio (currently not read in this routine)

  Output Parameter:
. Cmpi - preallocated (but unassembled) product matrix with the supporting
         Mat_APMPI attached to Cmpi->product
*/
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
{
  Mat_APMPI           *ptap;
  Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data;
  MPI_Comm            comm;
  Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  MatType             mtype;
  PetscSF             sf;
  PetscSFNode         *iremote;
  PetscInt            rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
  const PetscInt      *rootdegrees;
  PetscHSetI          ht,oht,*hta,*hto;
  PetscInt            pn,pon,*c_rmtc,i,j,nzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
  PetscInt            lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
  PetscInt            nalg=2,alg=0,offset,ii;
  PetscMPIInt         owner;
  const PetscInt      *mappingindices;
  PetscBool           flg;
  const char          *algTypes[2] = {"overlapping","merged"};
  IS                  map;

  PetscFunctionBegin;
  MatCheckProduct(Cmpi,5);
  PetscCheck(!Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A,&comm));

  /* Create symbolic parallel matrix Cmpi */
  PetscCall(MatGetLocalSize(P,NULL,&pn));
  pn *= dof;
  PetscCall(MatGetType(A,&mtype));
  PetscCall(MatSetType(Cmpi,mtype));
  /* NOTE(review): MatSetSizes is called again before preallocation below; this first call looks redundant */
  PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));

  PetscCall(PetscNew(&ptap));
  ptap->reuse = MAT_INITIAL_MATRIX;
  ptap->algType = 2;

  /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth));
  PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map));
  /* This equals to the number of offdiag columns in P */
  PetscCall(MatGetLocalSize(p->B,NULL,&pon));
  pon *= dof;
  /* offsets */
  PetscCall(PetscMalloc1(pon+1,&ptap->c_rmti));
  /* The number of columns we will send to remote ranks */
  PetscCall(PetscMalloc1(pon,&c_rmtc));
  /* One hash set per remote C row, accumulating its column pattern */
  PetscCall(PetscMalloc1(pon,&hta));
  for (i=0; i<pon; i++) {
    PetscCall(PetscHSetICreate(&hta[i]));
  }
  PetscCall(MatGetLocalSize(A,&am,NULL));
  /* NOTE(review): arstart/arend appear unused in the remainder of this routine */
  PetscCall(MatGetOwnershipRange(A,&arstart,&arend));
  /* Create hash table to merge all columns for C(i, :) */
  PetscCall(PetscHSetICreate(&ht));

  PetscCall(ISGetIndices(map,&mappingindices));
  ptap->c_rmti[0] = 0;
  /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
  for (i=0; i<am && pon; i++) {
    /* Form one row of AP */
    PetscCall(PetscHSetIClear(ht));
    offset = i%dof;
    ii     = i/dof;
    /* If the off diag is empty, we should not do any calculation */
    nzi = po->i[ii+1] - po->i[ii];
    if (!nzi) continue;

    PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,ht));
    PetscCall(PetscHSetIGetSize(ht,&htsize));
    /* If AP is empty, just continue */
    if (!htsize) continue;
    /* Form C(ii, :) */
    poj = po->j + po->i[ii];
    for (j=0; j<nzi; j++) {
      PetscCall(PetscHSetIUpdate(hta[poj[j]*dof+offset],ht));
    }
  }

  /* Convert the per-row hash sets into CSR-style offsets and counts */
  for (i=0; i<pon; i++) {
    PetscCall(PetscHSetIGetSize(hta[i],&htsize));
    ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
    c_rmtc[i] = htsize;
  }

  PetscCall(PetscMalloc1(ptap->c_rmti[pon],&c_rmtj));

  for (i=0; i<pon; i++) {
    off = 0;
    PetscCall(PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]));
    PetscCall(PetscHSetIDestroy(&hta[i]));
  }
  PetscCall(PetscFree(hta));

  /* Each remote row i maps to (owner rank, local index) of the matching P column */
  PetscCall(PetscMalloc1(pon,&iremote));
  for (i=0; i<pon; i++) {
    owner = 0; lidx = 0;
    offset = i%dof;
    ii     = i/dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx));
    iremote[i].index = lidx*dof + offset;
    iremote[i].rank  = owner;
  }

  PetscCall(PetscSFCreate(comm,&sf));
  PetscCall(PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER));
  /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
  PetscCall(PetscSFSetRankOrder(sf,PETSC_TRUE));
  PetscCall(PetscSFSetFromOptions(sf));
  PetscCall(PetscSFSetUp(sf));
  /* How many neighbors have contributions to my rows? */
  PetscCall(PetscSFComputeDegreeBegin(sf,&rootdegrees));
  PetscCall(PetscSFComputeDegreeEnd(sf,&rootdegrees));
  rootspacesize = 0;
  for (i = 0; i < pn; i++) {
    rootspacesize += rootdegrees[i];
  }
  PetscCall(PetscMalloc1(rootspacesize,&rootspace));
  PetscCall(PetscMalloc1(rootspacesize+1,&rootspaceoffsets));
  /* Get information from leaves
   * Number of columns other people contribute to my rows
   * */
  PetscCall(PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace));
  PetscCall(PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace));
  PetscCall(PetscFree(c_rmtc));
  PetscCall(PetscCalloc1(pn+1,&ptap->c_othi));
  /* The number of columns is received for each row */
  ptap->c_othi[0] = 0;
  rootspacesize = 0;
  rootspaceoffsets[0] = 0;
  for (i = 0; i < pn; i++) {
    rcvncols = 0;
    for (j = 0; j<rootdegrees[i]; j++) {
      rcvncols += rootspace[rootspacesize];
      rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
      rootspacesize++;
    }
    ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
  }
  PetscCall(PetscFree(rootspace));

  /* Tell each sender where its data lands in the receiver's buffer */
  PetscCall(PetscMalloc1(pon,&c_rmtoffsets));
  PetscCall(PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets));
  PetscCall(PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree(rootspaceoffsets));

  /* Build the final SF (ptap->sf), one leaf per sent column entry */
  PetscCall(PetscCalloc1(ptap->c_rmti[pon],&iremote));
  nleaves = 0;
  for (i = 0; i<pon; i++) {
    owner = 0;
    ii = i/dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL));
    sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
    for (j=0; j<sendncols; j++) {
      iremote[nleaves].rank = owner;
      iremote[nleaves++].index = c_rmtoffsets[i] + j;
    }
  }
  PetscCall(PetscFree(c_rmtoffsets));
  PetscCall(PetscCalloc1(ptap->c_othi[pn],&c_othj));

  PetscCall(PetscSFCreate(comm,&ptap->sf));
  PetscCall(PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER));
  PetscCall(PetscSFSetFromOptions(ptap->sf));
  /* One to one map */
  PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));

  PetscCall(PetscMalloc2(pn,&dnz,pn,&onz));
  PetscCall(PetscHSetICreate(&oht));
  PetscCall(MatGetOwnershipRangeColumn(P,&pcstart,&pcend));
  pcstart *= dof;
  pcend   *= dof;
  /* hta[i]/hto[i] collect the diagonal/off-diagonal column patterns of local C row i */
  PetscCall(PetscMalloc2(pn,&hta,pn,&hto));
  for (i=0; i<pn; i++) {
    PetscCall(PetscHSetICreate(&hta[i]));
    PetscCall(PetscHSetICreate(&hto[i]));
  }
  /* Work on local part */
  /* 4) Pass 1: Estimate memory for C_loc */
  for (i=0; i<am && pn; i++) {
    PetscCall(PetscHSetIClear(ht));
    PetscCall(PetscHSetIClear(oht));
    offset = i%dof;
    ii     = i/dof;
    nzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi) continue;

    PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht));
    PetscCall(PetscHSetIGetSize(ht,&htsize));
    PetscCall(PetscHSetIGetSize(oht,&htosize));
    if (!(htsize+htosize)) continue;
    /* Form C(ii, :) */
    pdj = pd->j + pd->i[ii];
    for (j=0; j<nzi; j++) {
      PetscCall(PetscHSetIUpdate(hta[pdj[j]*dof+offset],ht));
      PetscCall(PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht));
    }
  }

  PetscCall(ISRestoreIndices(map,&mappingindices));

  PetscCall(PetscHSetIDestroy(&ht));
  PetscCall(PetscHSetIDestroy(&oht));

  /* Get remote data */
  PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
  PetscCall(PetscFree(c_rmtj));

  /* Merge the received column patterns into the local per-row sets and count dnz/onz */
  for (i = 0; i < pn; i++) {
    nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
    rdj = c_othj + ptap->c_othi[i];
    for (j = 0; j < nzi; j++) {
      col = rdj[j];
      /* diag part */
      if (col>=pcstart && col<pcend) {
        PetscCall(PetscHSetIAdd(hta[i],col));
      } else { /* off diag */
        PetscCall(PetscHSetIAdd(hto[i],col));
      }
    }
    PetscCall(PetscHSetIGetSize(hta[i],&htsize));
    dnz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&hta[i]));
    PetscCall(PetscHSetIGetSize(hto[i],&htsize));
    onz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&hto[i]));
  }

  PetscCall(PetscFree2(hta,hto));
  PetscCall(PetscFree(c_othj));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));
  PetscCall(MatSetBlockSizes(Cmpi,dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs));
  PetscCall(MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz));
  PetscCall(MatSetUp(Cmpi));
  PetscCall(PetscFree2(dnz,onz));

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  /* pick an algorithm */
  PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");
  alg = 0;
  PetscCall(PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg));
  PetscOptionsEnd();
  switch (alg) {
    case 0:
      Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
      break;
    case 1:
      Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
      break;
    default:
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm ");
  }
  PetscFunctionReturn(0);
}
1285 
/* dof = 1 specialization of the all-at-once PtAP symbolic phase for plain MPIAIJ matrices */
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,PetscReal fill,Mat C)
{
  PetscFunctionBegin;
  PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A,P,1,fill,C));
  PetscFunctionReturn(0);
}
1292 
/*
  MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged - symbolic phase of the merged
  all-at-once C = P^T*A*P (algType 3).  Like
  MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(), but the remote (off-diagonal P)
  and local (diagonal P) symbolic patterns are built in one merged sweep over
  the rows of A, sharing the per-row AP hash sets.

  Collective.

  Input Parameters:
+ A    - MPIAIJ matrix
. P    - MPIAIJ projection matrix; dof consecutive rows/columns per node when dof > 1
. dof  - number of degrees of freedom per node
- fill - expected fill ratio (currently not read in this routine)

  Output Parameter:
. Cmpi - preallocated (but unassembled) product matrix with the supporting
         Mat_APMPI attached to Cmpi->product
*/
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
{
  Mat_APMPI           *ptap;
  Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data;
  MPI_Comm            comm;
  Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  MatType             mtype;
  PetscSF             sf;
  PetscSFNode         *iremote;
  PetscInt            rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
  const PetscInt      *rootdegrees;
  PetscHSetI          ht,oht,*hta,*hto,*htd;
  PetscInt            pn,pon,*c_rmtc,i,j,nzi,dnzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
  PetscInt            lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
  PetscInt            nalg=2,alg=0,offset,ii;
  PetscMPIInt         owner;
  PetscBool           flg;
  const char          *algTypes[2] = {"merged","overlapping"};
  const PetscInt      *mappingindices;
  IS                  map;

  PetscFunctionBegin;
  MatCheckProduct(Cmpi,5);
  PetscCheck(!Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A,&comm));

  /* Create symbolic parallel matrix Cmpi */
  PetscCall(MatGetLocalSize(P,NULL,&pn));
  pn *= dof;
  PetscCall(MatGetType(A,&mtype));
  PetscCall(MatSetType(Cmpi,mtype));
  /* NOTE(review): MatSetSizes is called again before preallocation below; this first call looks redundant */
  PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));

  PetscCall(PetscNew(&ptap));
  ptap->reuse = MAT_INITIAL_MATRIX;
  ptap->algType = 3;

  /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth));
  PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map));

  /* This equals to the number of offdiag columns in P */
  PetscCall(MatGetLocalSize(p->B,NULL,&pon));
  pon *= dof;
  /* offsets */
  PetscCall(PetscMalloc1(pon+1,&ptap->c_rmti));
  /* The number of columns we will send to remote ranks */
  PetscCall(PetscMalloc1(pon,&c_rmtc));
  /* One hash set per remote C row, accumulating its column pattern */
  PetscCall(PetscMalloc1(pon,&hta));
  for (i=0; i<pon; i++) {
    PetscCall(PetscHSetICreate(&hta[i]));
  }
  PetscCall(MatGetLocalSize(A,&am,NULL));
  /* NOTE(review): arstart/arend appear unused in the remainder of this routine */
  PetscCall(MatGetOwnershipRange(A,&arstart,&arend));
  /* Create hash table to merge all columns for C(i, :) */
  PetscCall(PetscHSetICreate(&ht));
  PetscCall(PetscHSetICreate(&oht));
  /* htd[i]/hto[i] collect the diagonal/off-diagonal column patterns of local C row i */
  PetscCall(PetscMalloc2(pn,&htd,pn,&hto));
  for (i=0; i<pn; i++) {
    PetscCall(PetscHSetICreate(&htd[i]));
    PetscCall(PetscHSetICreate(&hto[i]));
  }

  PetscCall(ISGetIndices(map,&mappingindices));
  ptap->c_rmti[0] = 0;
  /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
  for (i=0; i<am && (pon || pn); i++) {
    /* Form one row of AP */
    PetscCall(PetscHSetIClear(ht));
    PetscCall(PetscHSetIClear(oht));
    offset = i%dof;
    ii     = i/dof;
    /* If the off diag is empty, we should not do any calculation */
    nzi = po->i[ii+1] - po->i[ii];
    dnzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi && !dnzi) continue;

    PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht));
    PetscCall(PetscHSetIGetSize(ht,&htsize));
    PetscCall(PetscHSetIGetSize(oht,&htosize));
    /* If AP is empty, just continue */
    if (!(htsize+htosize)) continue;

    /* Form remote C(ii, :) */
    poj = po->j + po->i[ii];
    for (j=0; j<nzi; j++) {
      PetscCall(PetscHSetIUpdate(hta[poj[j]*dof+offset],ht));
      PetscCall(PetscHSetIUpdate(hta[poj[j]*dof+offset],oht));
    }

    /* Form local C(ii, :) */
    pdj = pd->j + pd->i[ii];
    for (j=0; j<dnzi; j++) {
      PetscCall(PetscHSetIUpdate(htd[pdj[j]*dof+offset],ht));
      PetscCall(PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht));
    }
  }

  PetscCall(ISRestoreIndices(map,&mappingindices));

  PetscCall(PetscHSetIDestroy(&ht));
  PetscCall(PetscHSetIDestroy(&oht));

  /* Convert the per-row hash sets into CSR-style offsets and counts */
  for (i=0; i<pon; i++) {
    PetscCall(PetscHSetIGetSize(hta[i],&htsize));
    ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
    c_rmtc[i] = htsize;
  }

  PetscCall(PetscMalloc1(ptap->c_rmti[pon],&c_rmtj));

  for (i=0; i<pon; i++) {
    off = 0;
    PetscCall(PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]));
    PetscCall(PetscHSetIDestroy(&hta[i]));
  }
  PetscCall(PetscFree(hta));

  /* Each remote row i maps to (owner rank, local index) of the matching P column */
  PetscCall(PetscMalloc1(pon,&iremote));
  for (i=0; i<pon; i++) {
    owner = 0; lidx = 0;
    offset = i%dof;
    ii     = i/dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx));
    iremote[i].index = lidx*dof+offset;
    iremote[i].rank  = owner;
  }

  PetscCall(PetscSFCreate(comm,&sf));
  PetscCall(PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER));
  /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
  PetscCall(PetscSFSetRankOrder(sf,PETSC_TRUE));
  PetscCall(PetscSFSetFromOptions(sf));
  PetscCall(PetscSFSetUp(sf));
  /* How many neighbors have contributions to my rows? */
  PetscCall(PetscSFComputeDegreeBegin(sf,&rootdegrees));
  PetscCall(PetscSFComputeDegreeEnd(sf,&rootdegrees));
  rootspacesize = 0;
  for (i = 0; i < pn; i++) {
    rootspacesize += rootdegrees[i];
  }
  PetscCall(PetscMalloc1(rootspacesize,&rootspace));
  PetscCall(PetscMalloc1(rootspacesize+1,&rootspaceoffsets));
  /* Get information from leaves
   * Number of columns other people contribute to my rows
   * */
  PetscCall(PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace));
  PetscCall(PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace));
  PetscCall(PetscFree(c_rmtc));
  PetscCall(PetscMalloc1(pn+1,&ptap->c_othi));
  /* The number of columns is received for each row */
  ptap->c_othi[0]     = 0;
  rootspacesize       = 0;
  rootspaceoffsets[0] = 0;
  for (i = 0; i < pn; i++) {
    rcvncols = 0;
    for (j = 0; j<rootdegrees[i]; j++) {
      rcvncols += rootspace[rootspacesize];
      rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
      rootspacesize++;
    }
    ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
  }
  PetscCall(PetscFree(rootspace));

  /* Tell each sender where its data lands in the receiver's buffer */
  PetscCall(PetscMalloc1(pon,&c_rmtoffsets));
  PetscCall(PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets));
  PetscCall(PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree(rootspaceoffsets));

  /* Build the final SF (ptap->sf), one leaf per sent column entry */
  PetscCall(PetscCalloc1(ptap->c_rmti[pon],&iremote));
  nleaves = 0;
  for (i = 0; i<pon; i++) {
    owner = 0;
    ii    = i/dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL));
    sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
    for (j=0; j<sendncols; j++) {
      iremote[nleaves].rank    = owner;
      iremote[nleaves++].index = c_rmtoffsets[i] + j;
    }
  }
  PetscCall(PetscFree(c_rmtoffsets));
  PetscCall(PetscCalloc1(ptap->c_othi[pn],&c_othj));

  PetscCall(PetscSFCreate(comm,&ptap->sf));
  PetscCall(PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER));
  PetscCall(PetscSFSetFromOptions(ptap->sf));
  /* One to one map */
  PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
  /* Get remote data */
  PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
  PetscCall(PetscFree(c_rmtj));
  PetscCall(PetscMalloc2(pn,&dnz,pn,&onz));
  PetscCall(MatGetOwnershipRangeColumn(P,&pcstart,&pcend));
  pcstart *= dof;
  pcend   *= dof;
  /* Merge the received column patterns into the local per-row sets and count dnz/onz */
  for (i = 0; i < pn; i++) {
    nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
    rdj = c_othj + ptap->c_othi[i];
    for (j = 0; j < nzi; j++) {
      col =  rdj[j];
      /* diag part */
      if (col>=pcstart && col<pcend) {
        PetscCall(PetscHSetIAdd(htd[i],col));
      } else { /* off diag */
        PetscCall(PetscHSetIAdd(hto[i],col));
      }
    }
    PetscCall(PetscHSetIGetSize(htd[i],&htsize));
    dnz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&htd[i]));
    PetscCall(PetscHSetIGetSize(hto[i],&htsize));
    onz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&hto[i]));
  }

  PetscCall(PetscFree2(htd,hto));
  PetscCall(PetscFree(c_othj));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));
  PetscCall(MatSetBlockSizes(Cmpi, dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs));
  PetscCall(MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz));
  PetscCall(PetscFree2(dnz,onz));

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  /* pick an algorithm */
  PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");
  alg = 0;
  PetscCall(PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg));
  PetscOptionsEnd();
  switch (alg) {
    case 0:
      Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
      break;
    case 1:
      Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
      break;
    default:
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm ");
  }
  PetscFunctionReturn(0);
}
1544 
1545 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,PetscReal fill,Mat C)
1546 {
1547   PetscFunctionBegin;
1548   PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,fill,C));
1549   PetscFunctionReturn(0);
1550 }
1551 
1552 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat Cmpi)
1553 {
1554   Mat_APMPI           *ptap;
1555   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
1556   MPI_Comm            comm;
1557   PetscMPIInt         size,rank;
1558   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1559   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
1560   PetscInt            *lnk,i,k,pnz,row,nsend;
1561   PetscBT             lnkbt;
1562   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,nrecv;
1563   PETSC_UNUSED PetscMPIInt icompleted=0;
1564   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1565   PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
1566   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1567   MPI_Request         *swaits,*rwaits;
1568   MPI_Status          *sstatus,rstatus;
1569   PetscLayout         rowmap;
1570   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
1571   PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
1572   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
1573   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
1574   PetscScalar         *apv;
1575   PetscTable          ta;
1576   MatType             mtype;
1577   const char          *prefix;
1578 #if defined(PETSC_USE_INFO)
1579   PetscReal           apfill;
1580 #endif
1581 
1582   PetscFunctionBegin;
1583   MatCheckProduct(Cmpi,4);
1584   PetscCheck(!Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
1585   PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
1586   PetscCallMPI(MPI_Comm_size(comm,&size));
1587   PetscCallMPI(MPI_Comm_rank(comm,&rank));
1588 
1589   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
1590 
1591   /* create symbolic parallel matrix Cmpi */
1592   PetscCall(MatGetType(A,&mtype));
1593   PetscCall(MatSetType(Cmpi,mtype));
1594 
1595   /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1596   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
1597 
1598   /* create struct Mat_APMPI and attached it to C later */
1599   PetscCall(PetscNew(&ptap));
1600   ptap->reuse = MAT_INITIAL_MATRIX;
1601   ptap->algType = 1;
1602 
1603   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1604   PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth));
1605   /* get P_loc by taking all local rows of P */
1606   PetscCall(MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc));
1607 
1608   /* (0) compute Rd = Pd^T, Ro = Po^T  */
1609   /* --------------------------------- */
1610   PetscCall(MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd));
1611   PetscCall(MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro));
1612 
1613   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
1614   /* ----------------------------------------------------------------- */
1615   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1616   if (ptap->P_oth) p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1617 
1618   /* create and initialize a linked list */
1619   PetscCall(PetscTableCreate(pn,pN,&ta)); /* for compute AP_loc and Cmpi */
1620   MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
1621   MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
1622   PetscCall(PetscTableGetCount(ta,&Crmax)); /* Crmax = nnz(sum of Prows) */
1623   /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
1624 
1625   PetscCall(PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt));
1626 
1627   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1628   if (ao) {
1629     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space));
1630   } else {
1631     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space));
1632   }
1633   current_space = free_space;
1634   nspacedouble  = 0;
1635 
1636   PetscCall(PetscMalloc1(am+1,&api));
1637   api[0] = 0;
1638   for (i=0; i<am; i++) {
1639     /* diagonal portion: Ad[i,:]*P */
1640     ai = ad->i; pi = p_loc->i;
1641     nzi = ai[i+1] - ai[i];
1642     aj  = ad->j + ai[i];
1643     for (j=0; j<nzi; j++) {
1644       row  = aj[j];
1645       pnz  = pi[row+1] - pi[row];
1646       Jptr = p_loc->j + pi[row];
1647       /* add non-zero cols of P into the sorted linked list lnk */
1648       PetscCall(PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt));
1649     }
1650     /* off-diagonal portion: Ao[i,:]*P */
1651     if (ao) {
1652       ai = ao->i; pi = p_oth->i;
1653       nzi = ai[i+1] - ai[i];
1654       aj  = ao->j + ai[i];
1655       for (j=0; j<nzi; j++) {
1656         row  = aj[j];
1657         pnz  = pi[row+1] - pi[row];
1658         Jptr = p_oth->j + pi[row];
1659         PetscCall(PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt));
1660       }
1661     }
1662     apnz     = lnk[0];
1663     api[i+1] = api[i] + apnz;
1664     if (ap_rmax < apnz) ap_rmax = apnz;
1665 
1666     /* if free space is not available, double the total space in the list */
1667     if (current_space->local_remaining<apnz) {
1668       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space));
1669       nspacedouble++;
1670     }
1671 
1672     /* Copy data into free space, then initialize lnk */
1673     PetscCall(PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt));
1674 
1675     current_space->array           += apnz;
1676     current_space->local_used      += apnz;
1677     current_space->local_remaining -= apnz;
1678   }
1679   /* Allocate space for apj and apv, initialize apj, and */
1680   /* destroy list of free space and other temporary array(s) */
1681   PetscCall(PetscMalloc2(api[am],&apj,api[am],&apv));
1682   PetscCall(PetscFreeSpaceContiguous(&free_space,apj));
1683   PetscCall(PetscLLDestroy(lnk,lnkbt));
1684 
1685   /* Create AP_loc for reuse */
1686   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc));
1687   PetscCall(MatSetType(ptap->AP_loc,((PetscObject)p->A)->type_name));
1688 #if defined(PETSC_USE_INFO)
1689   if (ao) {
1690     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
1691   } else {
1692     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
1693   }
1694   ptap->AP_loc->info.mallocs           = nspacedouble;
1695   ptap->AP_loc->info.fill_ratio_given  = fill;
1696   ptap->AP_loc->info.fill_ratio_needed = apfill;
1697 
1698   if (api[am]) {
1699     PetscCall(PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill));
1700     PetscCall(PetscInfo(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill));
1701   } else {
1702     PetscCall(PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n"));
1703   }
1704 #endif
1705 
1706   /* (2-1) compute symbolic Co = Ro*AP_loc  */
1707   /* ------------------------------------ */
1708   PetscCall(MatGetOptionsPrefix(A,&prefix));
1709   PetscCall(MatSetOptionsPrefix(ptap->Ro,prefix));
1710   PetscCall(MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_"));
1711   PetscCall(MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth));
1712   PetscCall(MatGetOptionsPrefix(Cmpi,&prefix));
1713   PetscCall(MatSetOptionsPrefix(ptap->C_oth,prefix));
1714   PetscCall(MatAppendOptionsPrefix(ptap->C_oth,"inner_C_oth_"));
1715   PetscCall(MatProductSetType(ptap->C_oth,MATPRODUCT_AB));
1716   PetscCall(MatProductSetAlgorithm(ptap->C_oth,"default"));
1717   PetscCall(MatProductSetFill(ptap->C_oth,fill));
1718   PetscCall(MatProductSetFromOptions(ptap->C_oth));
1719   PetscCall(MatProductSymbolic(ptap->C_oth));
1720 
1721   /* (3) send coj of C_oth to other processors  */
1722   /* ------------------------------------------ */
1723   /* determine row ownership */
1724   PetscCall(PetscLayoutCreate(comm,&rowmap));
1725   rowmap->n  = pn;
1726   rowmap->bs = 1;
1727   PetscCall(PetscLayoutSetUp(rowmap));
1728   owners = rowmap->range;
1729 
1730   /* determine the number of messages to send, their lengths */
1731   PetscCall(PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co));
1732   PetscCall(PetscArrayzero(len_s,size));
1733   PetscCall(PetscArrayzero(len_si,size));
1734 
1735   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
1736   coi   = c_oth->i; coj = c_oth->j;
1737   con   = ptap->C_oth->rmap->n;
1738   proc  = 0;
1739   for (i=0; i<con; i++) {
1740     while (prmap[i] >= owners[proc+1]) proc++;
1741     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
1742     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1743   }
1744 
1745   len          = 0; /* max length of buf_si[], see (4) */
1746   owners_co[0] = 0;
1747   nsend        = 0;
1748   for (proc=0; proc<size; proc++) {
1749     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1750     if (len_s[proc]) {
1751       nsend++;
1752       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1753       len         += len_si[proc];
1754     }
1755   }
1756 
1757   /* determine the number and length of messages to receive for coi and coj  */
1758   PetscCall(PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv));
1759   PetscCall(PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri));
1760 
1761   /* post the Irecv and Isend of coj */
1762   PetscCall(PetscCommGetNewTag(comm,&tagj));
1763   PetscCall(PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits));
1764   PetscCall(PetscMalloc1(nsend+1,&swaits));
1765   for (proc=0, k=0; proc<size; proc++) {
1766     if (!len_s[proc]) continue;
1767     i    = owners_co[proc];
1768     PetscCallMPI(MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k));
1769     k++;
1770   }
1771 
1772   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1773   /* ---------------------------------------- */
1774   PetscCall(MatSetOptionsPrefix(ptap->Rd,prefix));
1775   PetscCall(MatAppendOptionsPrefix(ptap->Rd,"inner_diag_"));
1776   PetscCall(MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc));
1777   PetscCall(MatGetOptionsPrefix(Cmpi,&prefix));
1778   PetscCall(MatSetOptionsPrefix(ptap->C_loc,prefix));
1779   PetscCall(MatAppendOptionsPrefix(ptap->C_loc,"inner_C_loc_"));
1780   PetscCall(MatProductSetType(ptap->C_loc,MATPRODUCT_AB));
1781   PetscCall(MatProductSetAlgorithm(ptap->C_loc,"default"));
1782   PetscCall(MatProductSetFill(ptap->C_loc,fill));
1783   PetscCall(MatProductSetFromOptions(ptap->C_loc));
1784   PetscCall(MatProductSymbolic(ptap->C_loc));
1785 
1786   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
1787 
1788   /* receives coj are complete */
1789   for (i=0; i<nrecv; i++) {
1790     PetscCallMPI(MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus));
1791   }
1792   PetscCall(PetscFree(rwaits));
1793   if (nsend) PetscCallMPI(MPI_Waitall(nsend,swaits,sstatus));
1794 
1795   /* add received column indices into ta to update Crmax */
1796   for (k=0; k<nrecv; k++) {/* k-th received message */
1797     Jptr = buf_rj[k];
1798     for (j=0; j<len_r[k]; j++) {
1799       PetscCall(PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES));
1800     }
1801   }
1802   PetscCall(PetscTableGetCount(ta,&Crmax));
1803   PetscCall(PetscTableDestroy(&ta));
1804 
1805   /* (4) send and recv coi */
1806   /*-----------------------*/
1807   PetscCall(PetscCommGetNewTag(comm,&tagi));
1808   PetscCall(PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits));
1809   PetscCall(PetscMalloc1(len+1,&buf_s));
1810   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1811   for (proc=0,k=0; proc<size; proc++) {
1812     if (!len_s[proc]) continue;
1813     /* form outgoing message for i-structure:
1814          buf_si[0]:                 nrows to be sent
1815                [1:nrows]:           row index (global)
1816                [nrows+1:2*nrows+1]: i-structure index
1817     */
1818     /*-------------------------------------------*/
1819     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
1820     buf_si_i    = buf_si + nrows+1;
1821     buf_si[0]   = nrows;
1822     buf_si_i[0] = 0;
1823     nrows       = 0;
1824     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1825       nzi = coi[i+1] - coi[i];
1826       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
1827       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1828       nrows++;
1829     }
1830     PetscCallMPI(MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k));
1831     k++;
1832     buf_si += len_si[proc];
1833   }
1834   for (i=0; i<nrecv; i++) {
1835     PetscCallMPI(MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus));
1836   }
1837   PetscCall(PetscFree(rwaits));
1838   if (nsend) PetscCallMPI(MPI_Waitall(nsend,swaits,sstatus));
1839 
1840   PetscCall(PetscFree4(len_s,len_si,sstatus,owners_co));
1841   PetscCall(PetscFree(len_ri));
1842   PetscCall(PetscFree(swaits));
1843   PetscCall(PetscFree(buf_s));
1844 
1845   /* (5) compute the local portion of Cmpi      */
1846   /* ------------------------------------------ */
1847   /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */
1848   PetscCall(PetscFreeSpaceGet(Crmax,&free_space));
1849   current_space = free_space;
1850 
1851   PetscCall(PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci));
1852   for (k=0; k<nrecv; k++) {
1853     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1854     nrows       = *buf_ri_k[k];
1855     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
1856     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
1857   }
1858 
1859   MatPreallocateBegin(comm,pn,pn,dnz,onz);
1860   PetscCall(PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt));
1861   for (i=0; i<pn; i++) {
1862     /* add C_loc into Cmpi */
1863     nzi  = c_loc->i[i+1] - c_loc->i[i];
1864     Jptr = c_loc->j + c_loc->i[i];
1865     PetscCall(PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt));
1866 
1867     /* add received col data into lnk */
1868     for (k=0; k<nrecv; k++) { /* k-th received message */
1869       if (i == *nextrow[k]) { /* i-th row */
1870         nzi  = *(nextci[k]+1) - *nextci[k];
1871         Jptr = buf_rj[k] + *nextci[k];
1872         PetscCall(PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt));
1873         nextrow[k]++; nextci[k]++;
1874       }
1875     }
1876     nzi = lnk[0];
1877 
1878     /* copy data into free space, then initialize lnk */
1879     PetscCall(PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt));
1880     PetscCall(MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz));
1881   }
1882   PetscCall(PetscFree3(buf_ri_k,nextrow,nextci));
1883   PetscCall(PetscLLDestroy(lnk,lnkbt));
1884   PetscCall(PetscFreeSpaceDestroy(free_space));
1885 
1886   /* local sizes and preallocation */
1887   PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));
1888   if (P->cmap->bs > 0) {
1889     PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs));
1890     PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs));
1891   }
1892   PetscCall(MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz));
1893   MatPreallocateEnd(dnz,onz);
1894 
1895   /* members in merge */
1896   PetscCall(PetscFree(id_r));
1897   PetscCall(PetscFree(len_r));
1898   PetscCall(PetscFree(buf_ri[0]));
1899   PetscCall(PetscFree(buf_ri));
1900   PetscCall(PetscFree(buf_rj[0]));
1901   PetscCall(PetscFree(buf_rj));
1902   PetscCall(PetscLayoutDestroy(&rowmap));
1903 
1904   PetscCall(PetscCalloc1(pN,&ptap->apa));
1905 
1906   /* attach the supporting struct to Cmpi for reuse */
1907   Cmpi->product->data    = ptap;
1908   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1909   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
1910 
1911   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1912   Cmpi->assembled = PETSC_FALSE;
1913   PetscFunctionReturn(0);
1914 }
1915 
1916 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
1917 {
1918   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
1919   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1920   Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
1921   Mat_APMPI         *ptap;
1922   Mat               AP_loc,C_loc,C_oth;
1923   PetscInt          i,rstart,rend,cm,ncols,row;
1924   PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
1925   PetscScalar       *apa;
1926   const PetscInt    *cols;
1927   const PetscScalar *vals;
1928 
1929   PetscFunctionBegin;
1930   MatCheckProduct(C,3);
1931   ptap = (Mat_APMPI*)C->product->data;
1932   PetscCheck(ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
1933   PetscCheck(ptap->AP_loc,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");
1934 
1935   PetscCall(MatZeroEntries(C));
1936   /* 1) get R = Pd^T,Ro = Po^T */
1937   if (ptap->reuse == MAT_REUSE_MATRIX) {
1938     PetscCall(MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd));
1939     PetscCall(MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro));
1940   }
1941 
1942   /* 2) get AP_loc */
1943   AP_loc = ptap->AP_loc;
1944   ap = (Mat_SeqAIJ*)AP_loc->data;
1945 
1946   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
1947   /*-----------------------------------------------------*/
1948   if (ptap->reuse == MAT_REUSE_MATRIX) {
1949     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
1950     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth));
1951     PetscCall(MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc));
1952   }
1953 
1954   /* 2-2) compute numeric A_loc*P - dominating part */
1955   /* ---------------------------------------------- */
1956   /* get data from symbolic products */
1957   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1958   if (ptap->P_oth) {
1959     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1960   }
1961   apa   = ptap->apa;
1962   api   = ap->i;
1963   apj   = ap->j;
1964   for (i=0; i<am; i++) {
1965     /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
1966     AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
1967     apnz = api[i+1] - api[i];
1968     for (j=0; j<apnz; j++) {
1969       col = apj[j+api[i]];
1970       ap->a[j+ap->i[i]] = apa[col];
1971       apa[col] = 0.0;
1972     }
1973   }
1974   /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
1975   PetscCall(PetscObjectStateIncrease((PetscObject)AP_loc));
1976 
1977   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
1978   PetscCall(MatProductNumeric(ptap->C_loc));
1979   PetscCall(MatProductNumeric(ptap->C_oth));
1980   C_loc = ptap->C_loc;
1981   C_oth = ptap->C_oth;
1982 
1983   /* add C_loc and Co to to C */
1984   PetscCall(MatGetOwnershipRange(C,&rstart,&rend));
1985 
1986   /* C_loc -> C */
1987   cm    = C_loc->rmap->N;
1988   c_seq = (Mat_SeqAIJ*)C_loc->data;
1989   cols = c_seq->j;
1990   vals = c_seq->a;
1991 
1992   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
1993   /* when there are no off-processor parts.  */
1994   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
1995   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
1996   /* a table, and other, more complex stuff has to be done. */
1997   if (C->assembled) {
1998     C->was_assembled = PETSC_TRUE;
1999     C->assembled     = PETSC_FALSE;
2000   }
2001   if (C->was_assembled) {
2002     for (i=0; i<cm; i++) {
2003       ncols = c_seq->i[i+1] - c_seq->i[i];
2004       row = rstart + i;
2005       PetscCall(MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES));
2006       cols += ncols; vals += ncols;
2007     }
2008   } else {
2009     PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a));
2010   }
2011 
2012   /* Co -> C, off-processor part */
2013   cm = C_oth->rmap->N;
2014   c_seq = (Mat_SeqAIJ*)C_oth->data;
2015   cols = c_seq->j;
2016   vals = c_seq->a;
2017   for (i=0; i<cm; i++) {
2018     ncols = c_seq->i[i+1] - c_seq->i[i];
2019     row = p->garray[i];
2020     PetscCall(MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES));
2021     cols += ncols; vals += ncols;
2022   }
2023 
2024   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
2025   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
2026 
2027   ptap->reuse = MAT_REUSE_MATRIX;
2028   PetscFunctionReturn(0);
2029 }
2030 
2031 PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
2032 {
2033   Mat_Product         *product = C->product;
2034   Mat                 A=product->A,P=product->B;
2035   MatProductAlgorithm alg=product->alg;
2036   PetscReal           fill=product->fill;
2037   PetscBool           flg;
2038 
2039   PetscFunctionBegin;
2040   /* scalable: do R=P^T locally, then C=R*A*P */
2041   PetscCall(PetscStrcmp(alg,"scalable",&flg));
2042   if (flg) {
2043     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,product->fill,C));
2044     C->ops->productnumeric = MatProductNumeric_PtAP;
2045     goto next;
2046   }
2047 
2048   /* nonscalable: do R=P^T locally, then C=R*A*P */
2049   PetscCall(PetscStrcmp(alg,"nonscalable",&flg));
2050   if (flg) {
2051     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C));
2052     goto next;
2053   }
2054 
2055   /* allatonce */
2056   PetscCall(PetscStrcmp(alg,"allatonce",&flg));
2057   if (flg) {
2058     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A,P,fill,C));
2059     goto next;
2060   }
2061 
2062   /* allatonce_merged */
2063   PetscCall(PetscStrcmp(alg,"allatonce_merged",&flg));
2064   if (flg) {
2065     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A,P,fill,C));
2066     goto next;
2067   }
2068 
2069   /* backend general code */
2070   PetscCall(PetscStrcmp(alg,"backend",&flg));
2071   if (flg) {
2072     PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
2073     PetscFunctionReturn(0);
2074   }
2075 
2076   /* hypre */
2077 #if defined(PETSC_HAVE_HYPRE)
2078   PetscCall(PetscStrcmp(alg,"hypre",&flg));
2079   if (flg) {
2080     PetscCall(MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C));
2081     C->ops->productnumeric = MatProductNumeric_PtAP;
2082     PetscFunctionReturn(0);
2083   }
2084 #endif
2085   SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
2086 
2087 next:
2088   C->ops->productnumeric = MatProductNumeric_PtAP;
2089   PetscFunctionReturn(0);
2090 }
2091