xref: /petsc/src/mat/impls/aij/mpi/mpiptap.c (revision a69119a591a03a9d906b29c0a4e9802e4d7c9795)
1 
2 /*
3   Defines projective product routines where A is a MPIAIJ matrix
4           C = P^T * A * P
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <../src/mat/impls/aij/mpi/mpiaij.h>
10 #include <petscbt.h>
11 #include <petsctime.h>
12 #include <petsc/private/hashmapiv.h>
13 #include <petsc/private/hashseti.h>
14 #include <petscsf.h>
15 
16 PetscErrorCode MatView_MPIAIJ_PtAP(Mat A, PetscViewer viewer) {
17   PetscBool         iascii;
18   PetscViewerFormat format;
19   Mat_APMPI        *ptap;
20 
21   PetscFunctionBegin;
22   MatCheckProduct(A, 1);
23   ptap = (Mat_APMPI *)A->product->data;
24   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
25   if (iascii) {
26     PetscCall(PetscViewerGetFormat(viewer, &format));
27     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
28       if (ptap->algType == 0) {
29         PetscCall(PetscViewerASCIIPrintf(viewer, "using scalable MatPtAP() implementation\n"));
30       } else if (ptap->algType == 1) {
31         PetscCall(PetscViewerASCIIPrintf(viewer, "using nonscalable MatPtAP() implementation\n"));
32       } else if (ptap->algType == 2) {
33         PetscCall(PetscViewerASCIIPrintf(viewer, "using allatonce MatPtAP() implementation\n"));
34       } else if (ptap->algType == 3) {
35         PetscCall(PetscViewerASCIIPrintf(viewer, "using merged allatonce MatPtAP() implementation\n"));
36       }
37     }
38   }
39   PetscFunctionReturn(0);
40 }
41 
42 PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data) {
43   Mat_APMPI           *ptap = (Mat_APMPI *)data;
44   Mat_Merge_SeqsToMPI *merge;
45 
46   PetscFunctionBegin;
47   PetscCall(PetscFree2(ptap->startsj_s, ptap->startsj_r));
48   PetscCall(PetscFree(ptap->bufa));
49   PetscCall(MatDestroy(&ptap->P_loc));
50   PetscCall(MatDestroy(&ptap->P_oth));
51   PetscCall(MatDestroy(&ptap->A_loc)); /* used by MatTransposeMatMult() */
52   PetscCall(MatDestroy(&ptap->Rd));
53   PetscCall(MatDestroy(&ptap->Ro));
54   if (ptap->AP_loc) { /* used by alg_rap */
55     Mat_SeqAIJ *ap = (Mat_SeqAIJ *)(ptap->AP_loc)->data;
56     PetscCall(PetscFree(ap->i));
57     PetscCall(PetscFree2(ap->j, ap->a));
58     PetscCall(MatDestroy(&ptap->AP_loc));
59   } else { /* used by alg_ptap */
60     PetscCall(PetscFree(ptap->api));
61     PetscCall(PetscFree(ptap->apj));
62   }
63   PetscCall(MatDestroy(&ptap->C_loc));
64   PetscCall(MatDestroy(&ptap->C_oth));
65   if (ptap->apa) PetscCall(PetscFree(ptap->apa));
66 
67   PetscCall(MatDestroy(&ptap->Pt));
68 
69   merge = ptap->merge;
70   if (merge) { /* used by alg_ptap */
71     PetscCall(PetscFree(merge->id_r));
72     PetscCall(PetscFree(merge->len_s));
73     PetscCall(PetscFree(merge->len_r));
74     PetscCall(PetscFree(merge->bi));
75     PetscCall(PetscFree(merge->bj));
76     PetscCall(PetscFree(merge->buf_ri[0]));
77     PetscCall(PetscFree(merge->buf_ri));
78     PetscCall(PetscFree(merge->buf_rj[0]));
79     PetscCall(PetscFree(merge->buf_rj));
80     PetscCall(PetscFree(merge->coi));
81     PetscCall(PetscFree(merge->coj));
82     PetscCall(PetscFree(merge->owners_co));
83     PetscCall(PetscLayoutDestroy(&merge->rowmap));
84     PetscCall(PetscFree(ptap->merge));
85   }
86   PetscCall(ISLocalToGlobalMappingDestroy(&ptap->ltog));
87 
88   PetscCall(PetscSFDestroy(&ptap->sf));
89   PetscCall(PetscFree(ptap->c_othi));
90   PetscCall(PetscFree(ptap->c_rmti));
91   PetscCall(PetscFree(ptap));
92   PetscFunctionReturn(0);
93 }
94 
95 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, Mat C) {
96   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
97   Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
98   Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
99   Mat_APMPI         *ptap;
100   Mat                AP_loc, C_loc, C_oth;
101   PetscInt           i, rstart, rend, cm, ncols, row, *api, *apj, am = A->rmap->n, apnz, nout;
102   PetscScalar       *apa;
103   const PetscInt    *cols;
104   const PetscScalar *vals;
105 
106   PetscFunctionBegin;
107   MatCheckProduct(C, 3);
108   ptap = (Mat_APMPI *)C->product->data;
109   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
110   PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
111 
112   PetscCall(MatZeroEntries(C));
113 
114   /* 1) get R = Pd^T,Ro = Po^T */
115   if (ptap->reuse == MAT_REUSE_MATRIX) {
116     PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
117     PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
118   }
119 
120   /* 2) get AP_loc */
121   AP_loc = ptap->AP_loc;
122   ap     = (Mat_SeqAIJ *)AP_loc->data;
123 
124   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
125   /*-----------------------------------------------------*/
126   if (ptap->reuse == MAT_REUSE_MATRIX) {
127     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
128     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
129     PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
130   }
131 
132   /* 2-2) compute numeric A_loc*P - dominating part */
133   /* ---------------------------------------------- */
134   /* get data from symbolic products */
135   p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
136   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
137 
138   api = ap->i;
139   apj = ap->j;
140   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, api[AP_loc->rmap->n], apj, apj));
141   for (i = 0; i < am; i++) {
142     /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
143     apnz = api[i + 1] - api[i];
144     apa  = ap->a + api[i];
145     PetscCall(PetscArrayzero(apa, apnz));
146     AProw_scalable(i, ad, ao, p_loc, p_oth, api, apj, apa);
147   }
148   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, api[AP_loc->rmap->n], apj, &nout, apj));
149   PetscCheck(api[AP_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, api[AP_loc->rmap->n], nout);
150 
151   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
152   /* Always use scalable version since we are in the MPI scalable version */
153   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd, AP_loc, ptap->C_loc));
154   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro, AP_loc, ptap->C_oth));
155 
156   C_loc = ptap->C_loc;
157   C_oth = ptap->C_oth;
158 
159   /* add C_loc and Co to to C */
160   PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
161 
162   /* C_loc -> C */
163   cm    = C_loc->rmap->N;
164   c_seq = (Mat_SeqAIJ *)C_loc->data;
165   cols  = c_seq->j;
166   vals  = c_seq->a;
167   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_loc->rmap->n], c_seq->j, c_seq->j));
168 
169   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
170   /* when there are no off-processor parts.  */
171   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
172   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
173   /* a table, and other, more complex stuff has to be done. */
174   if (C->assembled) {
175     C->was_assembled = PETSC_TRUE;
176     C->assembled     = PETSC_FALSE;
177   }
178   if (C->was_assembled) {
179     for (i = 0; i < cm; i++) {
180       ncols = c_seq->i[i + 1] - c_seq->i[i];
181       row   = rstart + i;
182       PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
183       cols += ncols;
184       vals += ncols;
185     }
186   } else {
187     PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
188   }
189   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_loc->rmap->n], c_seq->j, &nout, c_seq->j));
190   PetscCheck(c_seq->i[C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout);
191 
192   /* Co -> C, off-processor part */
193   cm    = C_oth->rmap->N;
194   c_seq = (Mat_SeqAIJ *)C_oth->data;
195   cols  = c_seq->j;
196   vals  = c_seq->a;
197   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_oth->rmap->n], c_seq->j, c_seq->j));
198   for (i = 0; i < cm; i++) {
199     ncols = c_seq->i[i + 1] - c_seq->i[i];
200     row   = p->garray[i];
201     PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
202     cols += ncols;
203     vals += ncols;
204   }
205   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
206   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
207 
208   ptap->reuse = MAT_REUSE_MATRIX;
209 
210   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_oth->rmap->n], c_seq->j, &nout, c_seq->j));
211   PetscCheck(c_seq->i[C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout);
212   PetscFunctionReturn(0);
213 }
214 
215 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, PetscReal fill, Mat Cmpi) {
216   Mat_APMPI               *ptap;
217   Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
218   MPI_Comm                 comm;
219   PetscMPIInt              size, rank;
220   Mat                      P_loc, P_oth;
221   PetscFreeSpaceList       free_space = NULL, current_space = NULL;
222   PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
223   PetscInt                *lnk, i, k, pnz, row, nsend;
224   PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
225   PETSC_UNUSED PetscMPIInt icompleted = 0;
226   PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
227   const PetscInt          *owners;
228   PetscInt                 len, proc, *dnz, *onz, nzi, nspacedouble;
229   PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
230   MPI_Request             *swaits, *rwaits;
231   MPI_Status              *sstatus, rstatus;
232   PetscLayout              rowmap;
233   PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
234   PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
235   PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, Crmax, *aj, *ai, *pi, nout;
236   Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
237   PetscScalar             *apv;
238   PetscTable               ta;
239   MatType                  mtype;
240   const char              *prefix;
241 #if defined(PETSC_USE_INFO)
242   PetscReal apfill;
243 #endif
244 
245   PetscFunctionBegin;
246   MatCheckProduct(Cmpi, 4);
247   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
248   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
249   PetscCallMPI(MPI_Comm_size(comm, &size));
250   PetscCallMPI(MPI_Comm_rank(comm, &rank));
251 
252   if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;
253 
254   /* create symbolic parallel matrix Cmpi */
255   PetscCall(MatGetType(A, &mtype));
256   PetscCall(MatSetType(Cmpi, mtype));
257 
258   /* create struct Mat_APMPI and attached it to C later */
259   PetscCall(PetscNew(&ptap));
260   ptap->reuse   = MAT_INITIAL_MATRIX;
261   ptap->algType = 0;
262 
263   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
264   PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &P_oth));
265   /* get P_loc by taking all local rows of P */
266   PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &P_loc));
267 
268   ptap->P_loc = P_loc;
269   ptap->P_oth = P_oth;
270 
271   /* (0) compute Rd = Pd^T, Ro = Po^T  */
272   /* --------------------------------- */
273   PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
274   PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));
275 
276   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
277   /* ----------------------------------------------------------------- */
278   p_loc = (Mat_SeqAIJ *)P_loc->data;
279   if (P_oth) p_oth = (Mat_SeqAIJ *)P_oth->data;
280 
281   /* create and initialize a linked list */
282   PetscCall(PetscTableCreate(pn, pN, &ta)); /* for compute AP_loc and Cmpi */
283   MatRowMergeMax_SeqAIJ(p_loc, P_loc->rmap->N, ta);
284   MatRowMergeMax_SeqAIJ(p_oth, P_oth->rmap->N, ta);
285   PetscCall(PetscTableGetCount(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
286 
287   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
288 
289   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
290   if (ao) {
291     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
292   } else {
293     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
294   }
295   current_space = free_space;
296   nspacedouble  = 0;
297 
298   PetscCall(PetscMalloc1(am + 1, &api));
299   api[0] = 0;
300   for (i = 0; i < am; i++) {
301     /* diagonal portion: Ad[i,:]*P */
302     ai  = ad->i;
303     pi  = p_loc->i;
304     nzi = ai[i + 1] - ai[i];
305     aj  = ad->j + ai[i];
306     for (j = 0; j < nzi; j++) {
307       row  = aj[j];
308       pnz  = pi[row + 1] - pi[row];
309       Jptr = p_loc->j + pi[row];
310       /* add non-zero cols of P into the sorted linked list lnk */
311       PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
312     }
313     /* off-diagonal portion: Ao[i,:]*P */
314     if (ao) {
315       ai  = ao->i;
316       pi  = p_oth->i;
317       nzi = ai[i + 1] - ai[i];
318       aj  = ao->j + ai[i];
319       for (j = 0; j < nzi; j++) {
320         row  = aj[j];
321         pnz  = pi[row + 1] - pi[row];
322         Jptr = p_oth->j + pi[row];
323         PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
324       }
325     }
326     apnz       = lnk[0];
327     api[i + 1] = api[i] + apnz;
328 
329     /* if free space is not available, double the total space in the list */
330     if (current_space->local_remaining < apnz) {
331       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
332       nspacedouble++;
333     }
334 
335     /* Copy data into free space, then initialize lnk */
336     PetscCall(PetscLLCondensedClean_Scalable(apnz, current_space->array, lnk));
337 
338     current_space->array += apnz;
339     current_space->local_used += apnz;
340     current_space->local_remaining -= apnz;
341   }
342   /* Allocate space for apj and apv, initialize apj, and */
343   /* destroy list of free space and other temporary array(s) */
344   PetscCall(PetscCalloc2(api[am], &apj, api[am], &apv));
345   PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
346   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
347 
348   /* Create AP_loc for reuse */
349   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
350   PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog));
351 
352 #if defined(PETSC_USE_INFO)
353   if (ao) {
354     apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
355   } else {
356     apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
357   }
358   ptap->AP_loc->info.mallocs           = nspacedouble;
359   ptap->AP_loc->info.fill_ratio_given  = fill;
360   ptap->AP_loc->info.fill_ratio_needed = apfill;
361 
362   if (api[am]) {
363     PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
364     PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n", (double)apfill));
365   } else {
366     PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc is empty \n"));
367   }
368 #endif
369 
370   /* (2-1) compute symbolic Co = Ro*AP_loc  */
371   /* -------------------------------------- */
372   PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
373   PetscCall(MatGetOptionsPrefix(A, &prefix));
374   PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
375   PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_offdiag_"));
376 
377   PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
378   PetscCall(MatProductSetAlgorithm(ptap->C_oth, "sorted"));
379   PetscCall(MatProductSetFill(ptap->C_oth, fill));
380   PetscCall(MatProductSetFromOptions(ptap->C_oth));
381   PetscCall(MatProductSymbolic(ptap->C_oth));
382 
383   /* (3) send coj of C_oth to other processors  */
384   /* ------------------------------------------ */
385   /* determine row ownership */
386   PetscCall(PetscLayoutCreate(comm, &rowmap));
387   PetscCall(PetscLayoutSetLocalSize(rowmap, pn));
388   PetscCall(PetscLayoutSetBlockSize(rowmap, 1));
389   PetscCall(PetscLayoutSetUp(rowmap));
390   PetscCall(PetscLayoutGetRanges(rowmap, &owners));
391 
392   /* determine the number of messages to send, their lengths */
393   PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
394   PetscCall(PetscArrayzero(len_s, size));
395   PetscCall(PetscArrayzero(len_si, size));
396 
397   c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
398   coi   = c_oth->i;
399   coj   = c_oth->j;
400   con   = ptap->C_oth->rmap->n;
401   proc  = 0;
402   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, coi[con], coj, coj));
403   for (i = 0; i < con; i++) {
404     while (prmap[i] >= owners[proc + 1]) proc++;
405     len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
406     len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
407   }
408 
409   len          = 0; /* max length of buf_si[], see (4) */
410   owners_co[0] = 0;
411   nsend        = 0;
412   for (proc = 0; proc < size; proc++) {
413     owners_co[proc + 1] = owners_co[proc] + len_si[proc];
414     if (len_s[proc]) {
415       nsend++;
416       len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
417       len += len_si[proc];
418     }
419   }
420 
421   /* determine the number and length of messages to receive for coi and coj  */
422   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
423   PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));
424 
425   /* post the Irecv and Isend of coj */
426   PetscCall(PetscCommGetNewTag(comm, &tagj));
427   PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
428   PetscCall(PetscMalloc1(nsend + 1, &swaits));
429   for (proc = 0, k = 0; proc < size; proc++) {
430     if (!len_s[proc]) continue;
431     i = owners_co[proc];
432     PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
433     k++;
434   }
435 
436   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
437   /* ---------------------------------------- */
438   PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
439   PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
440   PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
441   PetscCall(MatProductSetFill(ptap->C_loc, fill));
442 
443   PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
444   PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_diag_"));
445 
446   PetscCall(MatProductSetFromOptions(ptap->C_loc));
447   PetscCall(MatProductSymbolic(ptap->C_loc));
448 
449   c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
450   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, c_loc->j));
451 
452   /* receives coj are complete */
453   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
454   PetscCall(PetscFree(rwaits));
455   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
456 
457   /* add received column indices into ta to update Crmax */
458   for (k = 0; k < nrecv; k++) { /* k-th received message */
459     Jptr = buf_rj[k];
460     for (j = 0; j < len_r[k]; j++) PetscCall(PetscTableAdd(ta, *(Jptr + j) + 1, 1, INSERT_VALUES));
461   }
462   PetscCall(PetscTableGetCount(ta, &Crmax));
463   PetscCall(PetscTableDestroy(&ta));
464 
465   /* (4) send and recv coi */
466   /*-----------------------*/
467   PetscCall(PetscCommGetNewTag(comm, &tagi));
468   PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
469   PetscCall(PetscMalloc1(len + 1, &buf_s));
470   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
471   for (proc = 0, k = 0; proc < size; proc++) {
472     if (!len_s[proc]) continue;
473     /* form outgoing message for i-structure:
474          buf_si[0]:                 nrows to be sent
475                [1:nrows]:           row index (global)
476                [nrows+1:2*nrows+1]: i-structure index
477     */
478     /*-------------------------------------------*/
479     nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
480     buf_si_i    = buf_si + nrows + 1;
481     buf_si[0]   = nrows;
482     buf_si_i[0] = 0;
483     nrows       = 0;
484     for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
485       nzi                 = coi[i + 1] - coi[i];
486       buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
487       buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
488       nrows++;
489     }
490     PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
491     k++;
492     buf_si += len_si[proc];
493   }
494   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
495   PetscCall(PetscFree(rwaits));
496   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
497 
498   PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
499   PetscCall(PetscFree(len_ri));
500   PetscCall(PetscFree(swaits));
501   PetscCall(PetscFree(buf_s));
502 
503   /* (5) compute the local portion of Cmpi      */
504   /* ------------------------------------------ */
505   /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */
506   PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
507   current_space = free_space;
508 
509   PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
510   for (k = 0; k < nrecv; k++) {
511     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
512     nrows       = *buf_ri_k[k];
513     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
514     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
515   }
516 
517   MatPreallocateBegin(comm, pn, pn, dnz, onz);
518   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
519   for (i = 0; i < pn; i++) {
520     /* add C_loc into Cmpi */
521     nzi  = c_loc->i[i + 1] - c_loc->i[i];
522     Jptr = c_loc->j + c_loc->i[i];
523     PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
524 
525     /* add received col data into lnk */
526     for (k = 0; k < nrecv; k++) { /* k-th received message */
527       if (i == *nextrow[k]) {     /* i-th row */
528         nzi  = *(nextci[k] + 1) - *nextci[k];
529         Jptr = buf_rj[k] + *nextci[k];
530         PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
531         nextrow[k]++;
532         nextci[k]++;
533       }
534     }
535     nzi = lnk[0];
536 
537     /* copy data into free space, then initialize lnk */
538     PetscCall(PetscLLCondensedClean_Scalable(nzi, current_space->array, lnk));
539     PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
540   }
541   PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
542   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
543   PetscCall(PetscFreeSpaceDestroy(free_space));
544 
545   /* local sizes and preallocation */
546   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
547   if (P->cmap->bs > 0) {
548     PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
549     PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
550   }
551   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
552   MatPreallocateEnd(dnz, onz);
553 
554   /* members in merge */
555   PetscCall(PetscFree(id_r));
556   PetscCall(PetscFree(len_r));
557   PetscCall(PetscFree(buf_ri[0]));
558   PetscCall(PetscFree(buf_ri));
559   PetscCall(PetscFree(buf_rj[0]));
560   PetscCall(PetscFree(buf_rj));
561   PetscCall(PetscLayoutDestroy(&rowmap));
562 
563   nout = 0;
564   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_oth->i[ptap->C_oth->rmap->n], c_oth->j, &nout, c_oth->j));
565   PetscCheck(c_oth->i[ptap->C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_oth->i[ptap->C_oth->rmap->n], nout);
566   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, &nout, c_loc->j));
567   PetscCheck(c_loc->i[ptap->C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_loc->i[ptap->C_loc->rmap->n], nout);
568 
569   /* attach the supporting struct to Cmpi for reuse */
570   Cmpi->product->data    = ptap;
571   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
572   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
573 
574   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
575   Cmpi->assembled        = PETSC_FALSE;
576   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
577   PetscFunctionReturn(0);
578 }
579 
/*
  MatPtAPSymbolicComputeOneRowOfAP_private - collect the global column indices of row i of
  AP = A*P (for the all-at-once algorithms), splitting them into two hash sets:
    dht - columns in this process's column ownership range [pcstart, pcend)
    oht - all other (off-diagonal) columns

  map  - translation from Ao column index to the corresponding row of P_oth
  dof  - number of interleaved components; each A index encodes (row/dof, row%dof),
         so P row indices are scaled by dof and the component offset is re-applied
         to the produced column indices.

  NOTE(review): ao is taken from a->B->data unconditionally, so the `if (ao)` guard below
  is effectively always true when a->B exists — presumably kept for symmetry with the
  size>1 handling elsewhere; confirm before relying on it.
*/
static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHSetI dht, PetscHSetI oht) {
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
  PetscInt   *ai, nzi, j, *aj, row, col, *pi, *pj, pnz, nzpi, *p_othcols, k;
  PetscInt    pcstart, pcend, column, offset;

  PetscFunctionBegin;
  /* ownership range of P's columns, scaled to dof-interleaved indexing */
  pcstart = P->cmap->rstart;
  pcstart *= dof;
  pcend = P->cmap->rend;
  pcend *= dof;
  /* diagonal portion: Ad[i,:]*P */
  ai  = ad->i;
  nzi = ai[i + 1] - ai[i];
  aj  = ad->j + ai[i];
  for (j = 0; j < nzi; j++) {
    /* decode interleaved index: P row = row/dof, component = row%dof */
    row    = aj[j];
    offset = row % dof;
    row /= dof;
    nzpi = pd->i[row + 1] - pd->i[row];
    pj   = pd->j + pd->i[row];
    /* Pd columns are local, shift by pcstart to make them global; all land in dht */
    for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(dht, pj[k] * dof + offset + pcstart));
  }
  /* off diag P: same Ad entries, but gathered from Po (columns already global via garray) */
  for (j = 0; j < nzi; j++) {
    row    = aj[j];
    offset = row % dof;
    row /= dof;
    nzpi = po->i[row + 1] - po->i[row];
    pj   = po->j + po->i[row];
    for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(oht, p->garray[pj[k]] * dof + offset));
  }

  /* off diagonal part: Ao[i, :]*P_oth */
  if (ao) {
    ai  = ao->i;
    pi  = p_oth->i;
    nzi = ai[i + 1] - ai[i];
    aj  = ao->j + ai[i];
    for (j = 0; j < nzi; j++) {
      /* Ao column -> global row via garray (for the component offset), and -> P_oth row via map */
      row       = aj[j];
      offset    = a->garray[row] % dof;
      row       = map[row];
      pnz       = pi[row + 1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      /* P_oth columns may fall in either the local or remote ownership range; route accordingly */
      for (col = 0; col < pnz; col++) {
        column = p_othcols[col] * dof + offset;
        if (column >= pcstart && column < pcend) {
          PetscCall(PetscHSetIAdd(dht, column));
        } else {
          PetscCall(PetscHSetIAdd(oht, column));
        }
      }
    }
  } /* end if (ao) */
  PetscFunctionReturn(0);
}
637 
/*
  MatPtAPNumericComputeOneRowOfAP_private - numeric counterpart of the symbolic routine
  above: accumulate row i of AP = A*P into the hash map hmap, keyed by global (dof-scaled)
  column index with the running sum of A[i,k]*P[k,j] as value.

  map  - translation from Ao column index to the corresponding row of P_oth
  dof  - number of interleaved components per P row/column (same encoding as the
         symbolic routine: P row = index/dof, component offset = index%dof)

  NOTE(review): as in the symbolic routine, ao is always non-NULL here since it is read
  unconditionally from a->B->data; the `if (ao)` guard mirrors the size>1 convention used
  elsewhere in this file.
*/
static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHMapIV hmap) {
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
  PetscInt   *ai, nzi, j, *aj, row, col, *pi, pnz, *p_othcols, pcstart, *pj, k, nzpi, offset;
  PetscScalar ra, *aa, *pa;

  PetscFunctionBegin;
  /* start of this process's P column ownership range, dof-scaled (to globalize Pd columns) */
  pcstart = P->cmap->rstart;
  pcstart *= dof;

  /* diagonal portion: Ad[i,:]*P */
  ai  = ad->i;
  nzi = ai[i + 1] - ai[i];
  aj  = ad->j + ai[i];
  aa  = ad->a + ai[i];
  for (j = 0; j < nzi; j++) {
    /* ra = A[i, aj[j]]; scatter ra * Pd[row,:] into the map */
    ra     = aa[j];
    row    = aj[j];
    offset = row % dof;
    row /= dof;
    nzpi = pd->i[row + 1] - pd->i[row];
    pj   = pd->j + pd->i[row];
    pa   = pd->a + pd->i[row];
    for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, pj[k] * dof + offset + pcstart, ra * pa[k]));
    PetscCall(PetscLogFlops(2.0 * nzpi));
  }
  /* same Ad entries against Po (off-diagonal P columns, globalized via garray) */
  for (j = 0; j < nzi; j++) {
    ra     = aa[j];
    row    = aj[j];
    offset = row % dof;
    row /= dof;
    nzpi = po->i[row + 1] - po->i[row];
    pj   = po->j + po->i[row];
    pa   = po->a + po->i[row];
    for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, p->garray[pj[k]] * dof + offset, ra * pa[k]));
    PetscCall(PetscLogFlops(2.0 * nzpi));
  }

  /* off diagonal part: Ao[i, :]*P_oth */
  if (ao) {
    ai  = ao->i;
    pi  = p_oth->i;
    nzi = ai[i + 1] - ai[i];
    aj  = ao->j + ai[i];
    aa  = ao->a + ai[i];
    for (j = 0; j < nzi; j++) {
      /* Ao column -> component offset via garray, -> P_oth row via map */
      row       = aj[j];
      offset    = a->garray[row] % dof;
      row       = map[row];
      ra        = aa[j];
      pnz       = pi[row + 1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      pa        = p_oth->a + pi[row];
      for (col = 0; col < pnz; col++) PetscCall(PetscHMapIVAddValue(hmap, p_othcols[col] * dof + offset, ra * pa[col]));
      PetscCall(PetscLogFlops(2.0 * pnz));
    }
  } /* end if (ao) */

  PetscFunctionReturn(0);
}
698 
699 PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat, Mat, PetscInt dof, MatReuse, Mat *);
700 
701 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, Mat C) {
702   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
703   Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
704   Mat_APMPI      *ptap;
705   PetscHMapIV     hmap;
706   PetscInt        i, j, jj, kk, nzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, ccstart, ccend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, *dcc, *occ, loc;
707   PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
708   PetscInt        offset, ii, pocol;
709   const PetscInt *mappingindices;
710   IS              map;
711 
712   PetscFunctionBegin;
713   MatCheckProduct(C, 4);
714   ptap = (Mat_APMPI *)C->product->data;
715   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
716   PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
717 
718   PetscCall(MatZeroEntries(C));
719 
720   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
721   /*-----------------------------------------------------*/
722   if (ptap->reuse == MAT_REUSE_MATRIX) {
723     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
724     PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
725   }
726   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
727 
728   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
729   pon *= dof;
730   PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
731   PetscCall(MatGetLocalSize(A, &am, NULL));
732   cmaxr = 0;
733   for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
734   PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
735   PetscCall(PetscHMapIVCreate(&hmap));
736   PetscCall(PetscHMapIVResize(hmap, cmaxr));
737   PetscCall(ISGetIndices(map, &mappingindices));
738   for (i = 0; i < am && pon; i++) {
739     PetscCall(PetscHMapIVClear(hmap));
740     offset = i % dof;
741     ii     = i / dof;
742     nzi    = po->i[ii + 1] - po->i[ii];
743     if (!nzi) continue;
744     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
745     voff = 0;
746     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
747     if (!voff) continue;
748 
749     /* Form C(ii, :) */
750     poj = po->j + po->i[ii];
751     poa = po->a + po->i[ii];
752     for (j = 0; j < nzi; j++) {
753       pocol   = poj[j] * dof + offset;
754       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
755       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
756       for (jj = 0; jj < voff; jj++) {
757         apvaluestmp[jj] = apvalues[jj] * poa[j];
758         /*If the row is empty */
759         if (!c_rmtc[pocol]) {
760           c_rmtjj[jj] = apindices[jj];
761           c_rmtaa[jj] = apvaluestmp[jj];
762           c_rmtc[pocol]++;
763         } else {
764           PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
765           if (loc >= 0) { /* hit */
766             c_rmtaa[loc] += apvaluestmp[jj];
767             PetscCall(PetscLogFlops(1.0));
768           } else { /* new element */
769             loc = -(loc + 1);
770             /* Move data backward */
771             for (kk = c_rmtc[pocol]; kk > loc; kk--) {
772               c_rmtjj[kk] = c_rmtjj[kk - 1];
773               c_rmtaa[kk] = c_rmtaa[kk - 1];
774             } /* End kk */
775             c_rmtjj[loc] = apindices[jj];
776             c_rmtaa[loc] = apvaluestmp[jj];
777             c_rmtc[pocol]++;
778           }
779         }
780         PetscCall(PetscLogFlops(voff));
781       } /* End jj */
782     }   /* End j */
783   }     /* End i */
784 
785   PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
786   PetscCall(PetscHMapIVDestroy(&hmap));
787 
788   PetscCall(MatGetLocalSize(P, NULL, &pn));
789   pn *= dof;
790   PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));
791 
792   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
793   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
794   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
795   pcstart = pcstart * dof;
796   pcend   = pcend * dof;
797   cd      = (Mat_SeqAIJ *)(c->A)->data;
798   co      = (Mat_SeqAIJ *)(c->B)->data;
799 
800   cmaxr = 0;
801   for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
802   PetscCall(PetscCalloc5(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pn, &dcc, pn, &occ));
803   PetscCall(PetscHMapIVCreate(&hmap));
804   PetscCall(PetscHMapIVResize(hmap, cmaxr));
805   for (i = 0; i < am && pn; i++) {
806     PetscCall(PetscHMapIVClear(hmap));
807     offset = i % dof;
808     ii     = i / dof;
809     nzi    = pd->i[ii + 1] - pd->i[ii];
810     if (!nzi) continue;
811     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
812     voff = 0;
813     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
814     if (!voff) continue;
815     /* Form C(ii, :) */
816     pdj = pd->j + pd->i[ii];
817     pda = pd->a + pd->i[ii];
818     for (j = 0; j < nzi; j++) {
819       row = pcstart + pdj[j] * dof + offset;
820       for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
821       PetscCall(PetscLogFlops(voff));
822       PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
823     }
824   }
825   PetscCall(ISRestoreIndices(map, &mappingindices));
826   PetscCall(MatGetOwnershipRangeColumn(C, &ccstart, &ccend));
827   PetscCall(PetscFree5(apindices, apvalues, apvaluestmp, dcc, occ));
828   PetscCall(PetscHMapIVDestroy(&hmap));
829   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
830   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
831   PetscCall(PetscFree2(c_rmtj, c_rmta));
832 
833   /* Add contributions from remote */
834   for (i = 0; i < pn; i++) {
835     row = i + pcstart;
836     PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
837   }
838   PetscCall(PetscFree2(c_othj, c_otha));
839 
840   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
841   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
842 
843   ptap->reuse = MAT_REUSE_MATRIX;
844   PetscFunctionReturn(0);
845 }
846 
847 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, Mat C) {
848   PetscFunctionBegin;
849 
850   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, C));
851   PetscFunctionReturn(0);
852 }
853 
/*
   MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged - numeric phase of C = P^T*A*P
   ("merged" all-at-once variant) with 'dof' interlaced components per mesh point.

   Unlike the plain all-at-once variant, a single sweep over the local rows of A
   computes one row of A*P (in the hash map hmap) and immediately forms BOTH the
   remote contributions (via the off-diagonal part of P, buffered into
   c_rmtj/c_rmta for the SF reduction) and the local contributions (via the
   diagonal part of P, inserted with MatSetValues(..., ADD_VALUES)).

   Requires the matching symbolic phase to have attached the Mat_APMPI context
   (c_rmti/c_othi offsets and the SF) to C->product->data.
*/
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, Mat C) {
  Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
  Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
  Mat_APMPI      *ptap;
  PetscHMapIV     hmap;
  PetscInt        i, j, jj, kk, nzi, dnzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, loc;
  PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
  PetscInt        offset, ii, pocol;
  const PetscInt *mappingindices;
  IS              map;

  PetscFunctionBegin;
  MatCheckProduct(C, 4);
  ptap = (Mat_APMPI *)C->product->data;
  PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
  PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));

  /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
  }
  PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
  PetscCall(MatGetLocalSize(p->B, NULL, &pon));
  pon *= dof;
  PetscCall(MatGetLocalSize(P, NULL, &pn));
  pn *= dof;

  PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
  PetscCall(MatGetLocalSize(A, &am, NULL));
  PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
  pcstart *= dof;
  pcend *= dof;
  /* Single workspace serves both passes: size by the widest row over remote
     (c_rmti spans) and local (diag + offdiag of C) rows */
  cmaxr = 0;
  for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
  cd = (Mat_SeqAIJ *)(c->A)->data;
  co = (Mat_SeqAIJ *)(c->B)->data;
  for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
  PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
  PetscCall(PetscHMapIVCreate(&hmap));
  PetscCall(PetscHMapIVResize(hmap, cmaxr));
  PetscCall(ISGetIndices(map, &mappingindices));
  for (i = 0; i < am && (pon || pn); i++) {
    PetscCall(PetscHMapIVClear(hmap));
    offset = i % dof;
    ii     = i / dof;
    nzi    = po->i[ii + 1] - po->i[ii];
    dnzi   = pd->i[ii + 1] - pd->i[ii];
    /* Skip rows where both the off-diagonal and diagonal parts of P are empty */
    if (!nzi && !dnzi) continue;
    PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
    voff = 0;
    PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
    if (!voff) continue;

    /* Form remote C(ii, :) */
    poj = po->j + po->i[ii];
    poa = po->a + po->i[ii];
    for (j = 0; j < nzi; j++) {
      pocol   = poj[j] * dof + offset;
      c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
      c_rmtaa = c_rmta + ptap->c_rmti[pocol];
      for (jj = 0; jj < voff; jj++) {
        apvaluestmp[jj] = apvalues[jj] * poa[j];
        /* If the row is empty, append directly */
        if (!c_rmtc[pocol]) {
          c_rmtjj[jj] = apindices[jj];
          c_rmtaa[jj] = apvaluestmp[jj];
          c_rmtc[pocol]++;
        } else {
          /* Binary-search the sorted row; merge on hit, insert keeping sort order on miss */
          PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
          if (loc >= 0) { /* hit */
            c_rmtaa[loc] += apvaluestmp[jj];
            PetscCall(PetscLogFlops(1.0));
          } else { /* new element */
            loc = -(loc + 1);
            /* Move data backward */
            for (kk = c_rmtc[pocol]; kk > loc; kk--) {
              c_rmtjj[kk] = c_rmtjj[kk - 1];
              c_rmtaa[kk] = c_rmtaa[kk - 1];
            } /* End kk */
            c_rmtjj[loc] = apindices[jj];
            c_rmtaa[loc] = apvaluestmp[jj];
            c_rmtc[pocol]++;
          }
        }
      } /* End jj */
      PetscCall(PetscLogFlops(voff));
    } /* End j */

    /* Form local C(ii, :) */
    pdj = pd->j + pd->i[ii];
    pda = pd->a + pd->i[ii];
    for (j = 0; j < dnzi; j++) {
      row = pcstart + pdj[j] * dof + offset;
      for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j]; /* End kk */
      PetscCall(PetscLogFlops(voff));
      PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
    } /* End j */
  }   /* End i */

  PetscCall(ISRestoreIndices(map, &mappingindices));
  PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
  PetscCall(PetscHMapIVDestroy(&hmap));
  PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));

  /* Ship the remote contributions through the SF built by the symbolic phase */
  PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
  PetscCall(PetscFree2(c_rmtj, c_rmta));

  /* Add contributions from remote */
  for (i = 0; i < pn; i++) {
    row = i + pcstart;
    PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
  }
  PetscCall(PetscFree2(c_othj, c_otha));

  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

  /* Mark the product so subsequent calls refresh P_oth instead of rebuilding it */
  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(0);
}
981 
982 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, Mat C) {
983   PetscFunctionBegin;
984 
985   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, C));
986   PetscFunctionReturn(0);
987 }
988 
989 /* TODO: move algorithm selection to MatProductSetFromOptions */
990 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi) {
991   Mat_APMPI      *ptap;
992   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
993   MPI_Comm        comm;
994   Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
995   MatType         mtype;
996   PetscSF         sf;
997   PetscSFNode    *iremote;
998   PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
999   const PetscInt *rootdegrees;
1000   PetscHSetI      ht, oht, *hta, *hto;
1001   PetscInt        pn, pon, *c_rmtc, i, j, nzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
1002   PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
1003   PetscInt        nalg = 2, alg = 0, offset, ii;
1004   PetscMPIInt     owner;
1005   const PetscInt *mappingindices;
1006   PetscBool       flg;
1007   const char     *algTypes[2] = {"overlapping", "merged"};
1008   IS              map;
1009 
1010   PetscFunctionBegin;
1011   MatCheckProduct(Cmpi, 5);
1012   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
1013   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1014 
1015   /* Create symbolic parallel matrix Cmpi */
1016   PetscCall(MatGetLocalSize(P, NULL, &pn));
1017   pn *= dof;
1018   PetscCall(MatGetType(A, &mtype));
1019   PetscCall(MatSetType(Cmpi, mtype));
1020   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1021 
1022   PetscCall(PetscNew(&ptap));
1023   ptap->reuse   = MAT_INITIAL_MATRIX;
1024   ptap->algType = 2;
1025 
1026   /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1027   PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
1028   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
1029   /* This equals to the number of offdiag columns in P */
1030   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1031   pon *= dof;
1032   /* offsets */
1033   PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
1034   /* The number of columns we will send to remote ranks */
1035   PetscCall(PetscMalloc1(pon, &c_rmtc));
1036   PetscCall(PetscMalloc1(pon, &hta));
1037   for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
1038   PetscCall(MatGetLocalSize(A, &am, NULL));
1039   PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
1040   /* Create hash table to merge all columns for C(i, :) */
1041   PetscCall(PetscHSetICreate(&ht));
1042 
1043   PetscCall(ISGetIndices(map, &mappingindices));
1044   ptap->c_rmti[0] = 0;
1045   /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
1046   for (i = 0; i < am && pon; i++) {
1047     /* Form one row of AP */
1048     PetscCall(PetscHSetIClear(ht));
1049     offset = i % dof;
1050     ii     = i / dof;
1051     /* If the off diag is empty, we should not do any calculation */
1052     nzi    = po->i[ii + 1] - po->i[ii];
1053     if (!nzi) continue;
1054 
1055     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, ht));
1056     PetscCall(PetscHSetIGetSize(ht, &htsize));
1057     /* If AP is empty, just continue */
1058     if (!htsize) continue;
1059     /* Form C(ii, :) */
1060     poj = po->j + po->i[ii];
1061     for (j = 0; j < nzi; j++) PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
1062   }
1063 
1064   for (i = 0; i < pon; i++) {
1065     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1066     ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
1067     c_rmtc[i]           = htsize;
1068   }
1069 
1070   PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));
1071 
1072   for (i = 0; i < pon; i++) {
1073     off = 0;
1074     PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
1075     PetscCall(PetscHSetIDestroy(&hta[i]));
1076   }
1077   PetscCall(PetscFree(hta));
1078 
1079   PetscCall(PetscMalloc1(pon, &iremote));
1080   for (i = 0; i < pon; i++) {
1081     owner  = 0;
1082     lidx   = 0;
1083     offset = i % dof;
1084     ii     = i / dof;
1085     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1086     iremote[i].index = lidx * dof + offset;
1087     iremote[i].rank  = owner;
1088   }
1089 
1090   PetscCall(PetscSFCreate(comm, &sf));
1091   PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1092   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1093   PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
1094   PetscCall(PetscSFSetFromOptions(sf));
1095   PetscCall(PetscSFSetUp(sf));
1096   /* How many neighbors have contributions to my rows? */
1097   PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
1098   PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
1099   rootspacesize = 0;
1100   for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
1101   PetscCall(PetscMalloc1(rootspacesize, &rootspace));
1102   PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
1103   /* Get information from leaves
1104    * Number of columns other people contribute to my rows
1105    * */
1106   PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
1107   PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
1108   PetscCall(PetscFree(c_rmtc));
1109   PetscCall(PetscCalloc1(pn + 1, &ptap->c_othi));
1110   /* The number of columns is received for each row */
1111   ptap->c_othi[0]     = 0;
1112   rootspacesize       = 0;
1113   rootspaceoffsets[0] = 0;
1114   for (i = 0; i < pn; i++) {
1115     rcvncols = 0;
1116     for (j = 0; j < rootdegrees[i]; j++) {
1117       rcvncols += rootspace[rootspacesize];
1118       rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1119       rootspacesize++;
1120     }
1121     ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
1122   }
1123   PetscCall(PetscFree(rootspace));
1124 
1125   PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
1126   PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1127   PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1128   PetscCall(PetscSFDestroy(&sf));
1129   PetscCall(PetscFree(rootspaceoffsets));
1130 
1131   PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
1132   nleaves = 0;
1133   for (i = 0; i < pon; i++) {
1134     owner = 0;
1135     ii    = i / dof;
1136     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
1137     sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
1138     for (j = 0; j < sendncols; j++) {
1139       iremote[nleaves].rank    = owner;
1140       iremote[nleaves++].index = c_rmtoffsets[i] + j;
1141     }
1142   }
1143   PetscCall(PetscFree(c_rmtoffsets));
1144   PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));
1145 
1146   PetscCall(PetscSFCreate(comm, &ptap->sf));
1147   PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1148   PetscCall(PetscSFSetFromOptions(ptap->sf));
1149   /* One to one map */
1150   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1151 
1152   PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
1153   PetscCall(PetscHSetICreate(&oht));
1154   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1155   pcstart *= dof;
1156   pcend *= dof;
1157   PetscCall(PetscMalloc2(pn, &hta, pn, &hto));
1158   for (i = 0; i < pn; i++) {
1159     PetscCall(PetscHSetICreate(&hta[i]));
1160     PetscCall(PetscHSetICreate(&hto[i]));
1161   }
1162   /* Work on local part */
1163   /* 4) Pass 1: Estimate memory for C_loc */
1164   for (i = 0; i < am && pn; i++) {
1165     PetscCall(PetscHSetIClear(ht));
1166     PetscCall(PetscHSetIClear(oht));
1167     offset = i % dof;
1168     ii     = i / dof;
1169     nzi    = pd->i[ii + 1] - pd->i[ii];
1170     if (!nzi) continue;
1171 
1172     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
1173     PetscCall(PetscHSetIGetSize(ht, &htsize));
1174     PetscCall(PetscHSetIGetSize(oht, &htosize));
1175     if (!(htsize + htosize)) continue;
1176     /* Form C(ii, :) */
1177     pdj = pd->j + pd->i[ii];
1178     for (j = 0; j < nzi; j++) {
1179       PetscCall(PetscHSetIUpdate(hta[pdj[j] * dof + offset], ht));
1180       PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
1181     }
1182   }
1183 
1184   PetscCall(ISRestoreIndices(map, &mappingindices));
1185 
1186   PetscCall(PetscHSetIDestroy(&ht));
1187   PetscCall(PetscHSetIDestroy(&oht));
1188 
1189   /* Get remote data */
1190   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1191   PetscCall(PetscFree(c_rmtj));
1192 
1193   for (i = 0; i < pn; i++) {
1194     nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
1195     rdj = c_othj + ptap->c_othi[i];
1196     for (j = 0; j < nzi; j++) {
1197       col = rdj[j];
1198       /* diag part */
1199       if (col >= pcstart && col < pcend) {
1200         PetscCall(PetscHSetIAdd(hta[i], col));
1201       } else { /* off diag */
1202         PetscCall(PetscHSetIAdd(hto[i], col));
1203       }
1204     }
1205     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1206     dnz[i] = htsize;
1207     PetscCall(PetscHSetIDestroy(&hta[i]));
1208     PetscCall(PetscHSetIGetSize(hto[i], &htsize));
1209     onz[i] = htsize;
1210     PetscCall(PetscHSetIDestroy(&hto[i]));
1211   }
1212 
1213   PetscCall(PetscFree2(hta, hto));
1214   PetscCall(PetscFree(c_othj));
1215 
1216   /* local sizes and preallocation */
1217   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1218   PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
1219   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1220   PetscCall(MatSetUp(Cmpi));
1221   PetscCall(PetscFree2(dnz, onz));
1222 
1223   /* attach the supporting struct to Cmpi for reuse */
1224   Cmpi->product->data    = ptap;
1225   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1226   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
1227 
1228   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1229   Cmpi->assembled = PETSC_FALSE;
1230   /* pick an algorithm */
1231   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
1232   alg = 0;
1233   PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1234   PetscOptionsEnd();
1235   switch (alg) {
1236   case 0: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce; break;
1237   case 1: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged; break;
1238   default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, " Unsupported allatonce numerical algorithm ");
1239   }
1240   PetscFunctionReturn(0);
1241 }
1242 
/* Single-dof (dof = 1) front end for the all-at-once PtAP symbolic phase */
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, PetscReal fill, Mat C) {
  PetscFunctionBegin;
  PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, fill, C));
  PetscFunctionReturn(0);
}
1248 
1249 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi) {
1250   Mat_APMPI      *ptap;
1251   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
1252   MPI_Comm        comm;
1253   Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
1254   MatType         mtype;
1255   PetscSF         sf;
1256   PetscSFNode    *iremote;
1257   PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
1258   const PetscInt *rootdegrees;
1259   PetscHSetI      ht, oht, *hta, *hto, *htd;
1260   PetscInt        pn, pon, *c_rmtc, i, j, nzi, dnzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
1261   PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
1262   PetscInt        nalg = 2, alg = 0, offset, ii;
1263   PetscMPIInt     owner;
1264   PetscBool       flg;
1265   const char     *algTypes[2] = {"merged", "overlapping"};
1266   const PetscInt *mappingindices;
1267   IS              map;
1268 
1269   PetscFunctionBegin;
1270   MatCheckProduct(Cmpi, 5);
1271   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
1272   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1273 
1274   /* Create symbolic parallel matrix Cmpi */
1275   PetscCall(MatGetLocalSize(P, NULL, &pn));
1276   pn *= dof;
1277   PetscCall(MatGetType(A, &mtype));
1278   PetscCall(MatSetType(Cmpi, mtype));
1279   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1280 
1281   PetscCall(PetscNew(&ptap));
1282   ptap->reuse   = MAT_INITIAL_MATRIX;
1283   ptap->algType = 3;
1284 
1285   /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1286   PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
1287   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
1288 
1289   /* This equals to the number of offdiag columns in P */
1290   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1291   pon *= dof;
1292   /* offsets */
1293   PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
1294   /* The number of columns we will send to remote ranks */
1295   PetscCall(PetscMalloc1(pon, &c_rmtc));
1296   PetscCall(PetscMalloc1(pon, &hta));
1297   for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
1298   PetscCall(MatGetLocalSize(A, &am, NULL));
1299   PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
1300   /* Create hash table to merge all columns for C(i, :) */
1301   PetscCall(PetscHSetICreate(&ht));
1302   PetscCall(PetscHSetICreate(&oht));
1303   PetscCall(PetscMalloc2(pn, &htd, pn, &hto));
1304   for (i = 0; i < pn; i++) {
1305     PetscCall(PetscHSetICreate(&htd[i]));
1306     PetscCall(PetscHSetICreate(&hto[i]));
1307   }
1308 
1309   PetscCall(ISGetIndices(map, &mappingindices));
1310   ptap->c_rmti[0] = 0;
1311   /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
1312   for (i = 0; i < am && (pon || pn); i++) {
1313     /* Form one row of AP */
1314     PetscCall(PetscHSetIClear(ht));
1315     PetscCall(PetscHSetIClear(oht));
1316     offset = i % dof;
1317     ii     = i / dof;
1318     /* If the off diag is empty, we should not do any calculation */
1319     nzi    = po->i[ii + 1] - po->i[ii];
1320     dnzi   = pd->i[ii + 1] - pd->i[ii];
1321     if (!nzi && !dnzi) continue;
1322 
1323     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
1324     PetscCall(PetscHSetIGetSize(ht, &htsize));
1325     PetscCall(PetscHSetIGetSize(oht, &htosize));
1326     /* If AP is empty, just continue */
1327     if (!(htsize + htosize)) continue;
1328 
1329     /* Form remote C(ii, :) */
1330     poj = po->j + po->i[ii];
1331     for (j = 0; j < nzi; j++) {
1332       PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
1333       PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], oht));
1334     }
1335 
1336     /* Form local C(ii, :) */
1337     pdj = pd->j + pd->i[ii];
1338     for (j = 0; j < dnzi; j++) {
1339       PetscCall(PetscHSetIUpdate(htd[pdj[j] * dof + offset], ht));
1340       PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
1341     }
1342   }
1343 
1344   PetscCall(ISRestoreIndices(map, &mappingindices));
1345 
1346   PetscCall(PetscHSetIDestroy(&ht));
1347   PetscCall(PetscHSetIDestroy(&oht));
1348 
1349   for (i = 0; i < pon; i++) {
1350     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1351     ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
1352     c_rmtc[i]           = htsize;
1353   }
1354 
1355   PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));
1356 
1357   for (i = 0; i < pon; i++) {
1358     off = 0;
1359     PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
1360     PetscCall(PetscHSetIDestroy(&hta[i]));
1361   }
1362   PetscCall(PetscFree(hta));
1363 
1364   PetscCall(PetscMalloc1(pon, &iremote));
1365   for (i = 0; i < pon; i++) {
1366     owner  = 0;
1367     lidx   = 0;
1368     offset = i % dof;
1369     ii     = i / dof;
1370     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1371     iremote[i].index = lidx * dof + offset;
1372     iremote[i].rank  = owner;
1373   }
1374 
1375   PetscCall(PetscSFCreate(comm, &sf));
1376   PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1377   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1378   PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
1379   PetscCall(PetscSFSetFromOptions(sf));
1380   PetscCall(PetscSFSetUp(sf));
1381   /* How many neighbors have contributions to my rows? */
1382   PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
1383   PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
1384   rootspacesize = 0;
1385   for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
1386   PetscCall(PetscMalloc1(rootspacesize, &rootspace));
1387   PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
1388   /* Get information from leaves
1389    * Number of columns other people contribute to my rows
1390    * */
1391   PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
1392   PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
1393   PetscCall(PetscFree(c_rmtc));
1394   PetscCall(PetscMalloc1(pn + 1, &ptap->c_othi));
1395   /* The number of columns is received for each row */
1396   ptap->c_othi[0]     = 0;
1397   rootspacesize       = 0;
1398   rootspaceoffsets[0] = 0;
1399   for (i = 0; i < pn; i++) {
1400     rcvncols = 0;
1401     for (j = 0; j < rootdegrees[i]; j++) {
1402       rcvncols += rootspace[rootspacesize];
1403       rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1404       rootspacesize++;
1405     }
1406     ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
1407   }
1408   PetscCall(PetscFree(rootspace));
1409 
1410   PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
1411   PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1412   PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1413   PetscCall(PetscSFDestroy(&sf));
1414   PetscCall(PetscFree(rootspaceoffsets));
1415 
1416   PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
1417   nleaves = 0;
1418   for (i = 0; i < pon; i++) {
1419     owner = 0;
1420     ii    = i / dof;
1421     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
1422     sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
1423     for (j = 0; j < sendncols; j++) {
1424       iremote[nleaves].rank    = owner;
1425       iremote[nleaves++].index = c_rmtoffsets[i] + j;
1426     }
1427   }
1428   PetscCall(PetscFree(c_rmtoffsets));
1429   PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));
1430 
1431   PetscCall(PetscSFCreate(comm, &ptap->sf));
1432   PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1433   PetscCall(PetscSFSetFromOptions(ptap->sf));
1434   /* One to one map */
1435   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1436   /* Get remote data */
1437   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1438   PetscCall(PetscFree(c_rmtj));
1439   PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
1440   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1441   pcstart *= dof;
1442   pcend *= dof;
1443   for (i = 0; i < pn; i++) {
1444     nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
1445     rdj = c_othj + ptap->c_othi[i];
1446     for (j = 0; j < nzi; j++) {
1447       col = rdj[j];
1448       /* diag part */
1449       if (col >= pcstart && col < pcend) {
1450         PetscCall(PetscHSetIAdd(htd[i], col));
1451       } else { /* off diag */
1452         PetscCall(PetscHSetIAdd(hto[i], col));
1453       }
1454     }
1455     PetscCall(PetscHSetIGetSize(htd[i], &htsize));
1456     dnz[i] = htsize;
1457     PetscCall(PetscHSetIDestroy(&htd[i]));
1458     PetscCall(PetscHSetIGetSize(hto[i], &htsize));
1459     onz[i] = htsize;
1460     PetscCall(PetscHSetIDestroy(&hto[i]));
1461   }
1462 
1463   PetscCall(PetscFree2(htd, hto));
1464   PetscCall(PetscFree(c_othj));
1465 
1466   /* local sizes and preallocation */
1467   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1468   PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
1469   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1470   PetscCall(PetscFree2(dnz, onz));
1471 
1472   /* attach the supporting struct to Cmpi for reuse */
1473   Cmpi->product->data    = ptap;
1474   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1475   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
1476 
1477   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1478   Cmpi->assembled = PETSC_FALSE;
1479   /* pick an algorithm */
1480   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
1481   alg = 0;
1482   PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1483   PetscOptionsEnd();
1484   switch (alg) {
1485   case 0: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged; break;
1486   case 1: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce; break;
1487   default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, " Unsupported allatonce numerical algorithm ");
1488   }
1489   PetscFunctionReturn(0);
1490 }
1491 
/*
   MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged - Symbolic phase of C = P^T*A*P
   for MPIAIJ matrices using the merged "all-at-once" algorithm.

   Thin wrapper that forwards to the dof-aware MPIXAIJ variant with dof = 1
   (one degree of freedom per point).
*/
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, PetscReal fill, Mat C) {
  PetscFunctionBegin;
  PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, fill, C));
  PetscFunctionReturn(0);
}
1497 
/*
   MatPtAPSymbolic_MPIAIJ_MPIAIJ - Symbolic phase of C = P^T*A*P using the
   "nonscalable" algorithm (algType 1).

   Outline:
     (0) Rd = Pd^T, Ro = Po^T (transposes of P's diagonal/off-diagonal blocks)
     (1) symbolic AP_loc = A_loc*P = Ad*P_loc + Ao*P_oth via a condensed linked list
     (2) symbolic C_loc = Rd*AP_loc and C_oth = Ro*AP_loc as inner MatProducts
     (3)-(4) send each row of C_oth (its j-indices, then its i-structure) to the
         process owning that global row, overlapping communication with (2-2)
     (5) merge C_loc rows with received C_oth rows to preallocate Cmpi

   The supporting Mat_APMPI struct (ptap) is attached to Cmpi->product for reuse
   by MatPtAPNumeric_MPIAIJ_MPIAIJ(); Cmpi is left unassembled.

   "Nonscalable" refers to the O(pN) dense work array (ptap->apa) and the
   pN-wide linked-list bitmap used per process.
*/
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A, Mat P, PetscReal fill, Mat Cmpi) {
  Mat_APMPI               *ptap;
  Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  MPI_Comm                 comm;
  PetscMPIInt              size, rank;
  PetscFreeSpaceList       free_space = NULL, current_space = NULL;
  PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
  PetscInt                *lnk, i, k, pnz, row, nsend;
  PetscBT                  lnkbt;
  PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
  PETSC_UNUSED PetscMPIInt icompleted = 0;
  PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
  PetscInt                 len, proc, *dnz, *onz, *owners, nzi, nspacedouble;
  PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
  MPI_Request             *swaits, *rwaits;
  MPI_Status              *sstatus, rstatus;
  PetscLayout              rowmap;
  PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
  PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
  PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, ap_rmax = 0, Crmax, *aj, *ai, *pi;
  Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
  PetscScalar             *apv;
  PetscTable               ta;
  MatType                  mtype;
  const char              *prefix;
#if defined(PETSC_USE_INFO)
  PetscReal apfill;
#endif

  PetscFunctionBegin;
  MatCheckProduct(Cmpi, 4);
  PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));

  /* the off-diagonal block A->B only exists when running on more than one process */
  if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;

  /* create symbolic parallel matrix Cmpi */
  PetscCall(MatGetType(A, &mtype));
  PetscCall(MatSetType(Cmpi, mtype));

  /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;

  /* create struct Mat_APMPI and attached it to C later */
  PetscCall(PetscNew(&ptap));
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 1;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
  /* get P_loc by taking all local rows of P */
  PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc));

  /* (0) compute Rd = Pd^T, Ro = Po^T  */
  /* --------------------------------- */
  PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
  PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  /* ----------------------------------------------------------------- */
  p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;

  /* create and initialize a linked list */
  PetscCall(PetscTableCreate(pn, pN, &ta)); /* for compute AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc, ptap->P_loc->rmap->N, ta);
  MatRowMergeMax_SeqAIJ(p_oth, ptap->P_oth->rmap->N, ta);
  PetscCall(PetscTableGetCount(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
  /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */

  PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
  } else {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
  }
  current_space = free_space;
  nspacedouble  = 0;

  /* For each local row of A, merge the column patterns of the selected P rows
     into the sorted linked list, then flush the row into the free-space list */
  PetscCall(PetscMalloc1(am + 1, &api));
  api[0] = 0;
  for (i = 0; i < am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai  = ad->i;
    pi  = p_loc->i;
    nzi = ai[i + 1] - ai[i];
    aj  = ad->j + ai[i];
    for (j = 0; j < nzi; j++) {
      row  = aj[j];
      pnz  = pi[row + 1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai  = ao->i;
      pi  = p_oth->i;
      nzi = ai[i + 1] - ai[i];
      aj  = ao->j + ai[i];
      for (j = 0; j < nzi; j++) {
        row  = aj[j];
        pnz  = pi[row + 1] - pi[row];
        Jptr = p_oth->j + pi[row];
        PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
      }
    }
    apnz       = lnk[0];
    api[i + 1] = api[i] + apnz;
    if (ap_rmax < apnz) ap_rmax = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining < apnz) {
      PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean(pN, apnz, current_space->array, lnk, lnkbt));

    current_space->array += apnz;
    current_space->local_used += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  PetscCall(PetscMalloc2(api[am], &apj, api[am], &apv));
  PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
  PetscCall(PetscLLDestroy(lnk, lnkbt));

  /* Create AP_loc for reuse */
  PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
  PetscCall(MatSetType(ptap->AP_loc, ((PetscObject)p->A)->type_name));
#if defined(PETSC_USE_INFO)
  if (ao) {
    apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
  } else {
    apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
    PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n", (double)apfill));
  } else {
    PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc is empty \n"));
  }
#endif

  /* (2-1) compute symbolic Co = Ro*AP_loc  */
  /* ------------------------------------ */
  PetscCall(MatGetOptionsPrefix(A, &prefix));
  PetscCall(MatSetOptionsPrefix(ptap->Ro, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->Ro, "inner_offdiag_"));
  PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
  PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
  PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_C_oth_"));
  PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
  PetscCall(MatProductSetAlgorithm(ptap->C_oth, "default"));
  PetscCall(MatProductSetFill(ptap->C_oth, fill));
  PetscCall(MatProductSetFromOptions(ptap->C_oth));
  PetscCall(MatProductSymbolic(ptap->C_oth));

  /* (3) send coj of C_oth to other processors  */
  /* ------------------------------------------ */
  /* determine row ownership */
  PetscCall(PetscLayoutCreate(comm, &rowmap));
  rowmap->n  = pn;
  rowmap->bs = 1;
  PetscCall(PetscLayoutSetUp(rowmap));
  owners = rowmap->range;

  /* determine the number of messages to send, their lengths */
  PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
  PetscCall(PetscArrayzero(len_s, size));
  PetscCall(PetscArrayzero(len_si, size));

  c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
  coi   = c_oth->i;
  coj   = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  /* single forward sweep over 'proc' to find each row's owner;
     NOTE(review): relies on prmap (p->garray) being sorted ascending — confirm */
  for (i = 0; i < con; i++) {
    while (prmap[i] >= owners[proc + 1]) proc++;
    len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc = 0; proc < size; proc++) {
    owners_co[proc + 1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj  */
  PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
  PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));

  /* post the Irecv and Isend of coj */
  PetscCall(PetscCommGetNewTag(comm, &tagj));
  PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
  PetscCall(PetscMalloc1(nsend + 1, &swaits));
  /* send buffers point into coj, which stays alive until the waits complete */
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    i = owners_co[proc];
    PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc (overlapped with the coj messages) */
  /* ---------------------------------------- */
  PetscCall(MatSetOptionsPrefix(ptap->Rd, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->Rd, "inner_diag_"));
  PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
  PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
  PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_C_loc_"));
  PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
  PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
  PetscCall(MatProductSetFill(ptap->C_loc, fill));
  PetscCall(MatProductSetFromOptions(ptap->C_loc));
  PetscCall(MatProductSymbolic(ptap->C_loc));

  c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;

  /* receives coj are complete */
  for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

  /* add received column indices into ta to update Crmax */
  for (k = 0; k < nrecv; k++) { /* k-th received message */
    Jptr = buf_rj[k];
    for (j = 0; j < len_r[k]; j++) PetscCall(PetscTableAdd(ta, *(Jptr + j) + 1, 1, INSERT_VALUES));
  }
  PetscCall(PetscTableGetCount(ta, &Crmax));
  PetscCall(PetscTableDestroy(&ta));

  /* (4) send and recv coi */
  /*-----------------------*/
  PetscCall(PetscCommGetNewTag(comm, &tagi));
  PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
  PetscCall(PetscMalloc1(len + 1, &buf_s));
  buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows + 1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
      nzi                 = coi[i + 1] - coi[i];
      buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
      buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
    k++;
    buf_si += len_si[proc];
  }
  for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

  PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
  PetscCall(PetscFree(len_ri));
  PetscCall(PetscFree(swaits));
  PetscCall(PetscFree(buf_s));

  /* (5) compute the local portion of Cmpi      */
  /* ------------------------------------------ */
  /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */
  PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
  current_space = free_space;

  /* per-message cursors: current row number and i-structure position */
  PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
  for (k = 0; k < nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
  }

  MatPreallocateBegin(comm, pn, pn, dnz, onz);
  PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
  for (i = 0; i < pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i + 1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));

    /* add received col data into lnk */
    for (k = 0; k < nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) {     /* i-th row */
        nzi  = *(nextci[k] + 1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
        nextrow[k]++;
        nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean(pN, nzi, current_space->array, lnk, lnkbt));
    PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
  }
  PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
  PetscCall(PetscLLDestroy(lnk, lnkbt));
  PetscCall(PetscFreeSpaceDestroy(free_space));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
  if (P->cmap->bs > 0) {
    PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
    PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
  }
  PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
  MatPreallocateEnd(dnz, onz);

  /* members in merge */
  PetscCall(PetscFree(id_r));
  PetscCall(PetscFree(len_r));
  PetscCall(PetscFree(buf_ri[0]));
  PetscCall(PetscFree(buf_ri));
  PetscCall(PetscFree(buf_rj[0]));
  PetscCall(PetscFree(buf_rj));
  PetscCall(PetscLayoutDestroy(&rowmap));

  /* dense work array of length pN for the numeric phase (the "nonscalable" part) */
  PetscCall(PetscCalloc1(pN, &ptap->apa));

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
1858 
/*
   MatPtAPNumeric_MPIAIJ_MPIAIJ - Numeric phase of C = P^T*A*P using the data
   structures built by MatPtAPSymbolic_MPIAIJ_MPIAIJ() (attached to C->product).

   Steps:
     1) refresh Rd = Pd^T, Ro = Po^T (and P_loc/P_oth) when reusing C
     2) fill the values of AP_loc = A_loc*P row by row through the dense work
        array ptap->apa (length pN, allocated in the symbolic phase)
     3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc via the cached inner products
     4) scatter C_loc and C_oth entries into C with ADD_VALUES and assemble

   On exit ptap->reuse is set to MAT_REUSE_MATRIX so subsequent calls take the
   reuse paths above.
*/
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A, Mat P, Mat C) {
  Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
  Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
  Mat_APMPI         *ptap;
  Mat                AP_loc, C_loc, C_oth;
  PetscInt           i, rstart, rend, cm, ncols, row;
  PetscInt          *api, *apj, am = A->rmap->n, j, col, apnz;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  ptap = (Mat_APMPI *)C->product->data;
  PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
  PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));
  /* 1) get R = Pd^T,Ro = Po^T */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
    PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap     = (Mat_SeqAIJ *)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
    PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* ---------------------------------------------- */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
  apa = ptap->apa;
  api = ap->i;
  apj = ap->j;
  for (i = 0; i < am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
    AProw_nonscalable(i, ad, ao, p_loc, p_oth, apa);
    apnz = api[i + 1] - api[i];
    /* gather the row from the dense accumulator into AP_loc's CSR values,
       zeroing each touched entry so apa is clean for the next row */
    for (j = 0; j < apnz; j++) {
      col                 = apj[j + api[i]];
      ap->a[j + ap->i[i]] = apa[col];
      apa[col]            = 0.0;
    }
  }
  /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
  PetscCall(PetscObjectStateIncrease((PetscObject)AP_loc));

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  PetscCall(MatProductNumeric(ptap->C_loc));
  PetscCall(MatProductNumeric(ptap->C_oth));
  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and Co to to C */
  PetscCall(MatGetOwnershipRange(C, &rstart, &rend));

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_loc->data;
  cols  = c_seq->j;
  vals  = c_seq->a;

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
  /* when there are no off-processor parts.  */
  /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
  /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
  /* a table, and other, more complex stuff has to be done. */
  if (C->assembled) {
    C->was_assembled = PETSC_TRUE;
    C->assembled     = PETSC_FALSE;
  }
  if (C->was_assembled) {
    for (i = 0; i < cm; i++) {
      ncols = c_seq->i[i + 1] - c_seq->i[i];
      row   = rstart + i;
      PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
      cols += ncols;
      vals += ncols;
    }
  } else {
    PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
  }

  /* Co -> C, off-processor part */
  cm    = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_oth->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  /* rows of C_oth correspond to global rows p->garray[i] owned by other processes */
  for (i = 0; i < cm; i++) {
    ncols = c_seq->i[i + 1] - c_seq->i[i];
    row   = p->garray[i];
    PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
    cols += ncols;
    vals += ncols;
  }

  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(0);
}
1972 
1973 PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C) {
1974   Mat_Product        *product = C->product;
1975   Mat                 A = product->A, P = product->B;
1976   MatProductAlgorithm alg  = product->alg;
1977   PetscReal           fill = product->fill;
1978   PetscBool           flg;
1979 
1980   PetscFunctionBegin;
1981   /* scalable: do R=P^T locally, then C=R*A*P */
1982   PetscCall(PetscStrcmp(alg, "scalable", &flg));
1983   if (flg) {
1984     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A, P, product->fill, C));
1985     C->ops->productnumeric = MatProductNumeric_PtAP;
1986     goto next;
1987   }
1988 
1989   /* nonscalable: do R=P^T locally, then C=R*A*P */
1990   PetscCall(PetscStrcmp(alg, "nonscalable", &flg));
1991   if (flg) {
1992     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ(A, P, fill, C));
1993     goto next;
1994   }
1995 
1996   /* allatonce */
1997   PetscCall(PetscStrcmp(alg, "allatonce", &flg));
1998   if (flg) {
1999     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A, P, fill, C));
2000     goto next;
2001   }
2002 
2003   /* allatonce_merged */
2004   PetscCall(PetscStrcmp(alg, "allatonce_merged", &flg));
2005   if (flg) {
2006     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A, P, fill, C));
2007     goto next;
2008   }
2009 
2010   /* backend general code */
2011   PetscCall(PetscStrcmp(alg, "backend", &flg));
2012   if (flg) {
2013     PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
2014     PetscFunctionReturn(0);
2015   }
2016 
2017   /* hypre */
2018 #if defined(PETSC_HAVE_HYPRE)
2019   PetscCall(PetscStrcmp(alg, "hypre", &flg));
2020   if (flg) {
2021     PetscCall(MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A, P, fill, C));
2022     C->ops->productnumeric = MatProductNumeric_PtAP;
2023     PetscFunctionReturn(0);
2024   }
2025 #endif
2026   SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
2027 
2028 next:
2029   C->ops->productnumeric = MatProductNumeric_PtAP;
2030   PetscFunctionReturn(0);
2031 }
2032