xref: /petsc/src/mat/impls/aij/mpi/mpiptap.c (revision 2ff79c18c26c94ed8cb599682f680f231dca6444)
1 /*
2   Defines projective product routines where A is a MPIAIJ matrix
3           C = P^T * A * P
4 */
5 
6 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7 #include <../src/mat/utils/freespace.h>
8 #include <../src/mat/impls/aij/mpi/mpiaij.h>
9 #include <petscbt.h>
10 #include <petsctime.h>
11 #include <petsc/private/hashmapiv.h>
12 #include <petsc/private/hashseti.h>
13 #include <petscsf.h>
14 
15 static PetscErrorCode MatView_MPIAIJ_PtAP(Mat A, PetscViewer viewer)
16 {
17   PetscBool         isascii;
18   PetscViewerFormat format;
19   Mat_APMPI        *ptap;
20 
21   PetscFunctionBegin;
22   MatCheckProduct(A, 1);
23   ptap = (Mat_APMPI *)A->product->data;
24   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
25   if (isascii) {
26     PetscCall(PetscViewerGetFormat(viewer, &format));
27     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
28       if (ptap->algType == 0) {
29         PetscCall(PetscViewerASCIIPrintf(viewer, "using scalable MatPtAP() implementation\n"));
30       } else if (ptap->algType == 1) {
31         PetscCall(PetscViewerASCIIPrintf(viewer, "using nonscalable MatPtAP() implementation\n"));
32       } else if (ptap->algType == 2) {
33         PetscCall(PetscViewerASCIIPrintf(viewer, "using allatonce MatPtAP() implementation\n"));
34       } else if (ptap->algType == 3) {
35         PetscCall(PetscViewerASCIIPrintf(viewer, "using merged allatonce MatPtAP() implementation\n"));
36       }
37     }
38   }
39   PetscFunctionReturn(PETSC_SUCCESS);
40 }
41 
42 PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
43 {
44   Mat_APMPI           *ptap = (Mat_APMPI *)data;
45   Mat_Merge_SeqsToMPI *merge;
46 
47   PetscFunctionBegin;
48   PetscCall(PetscFree2(ptap->startsj_s, ptap->startsj_r));
49   PetscCall(PetscFree(ptap->bufa));
50   PetscCall(MatDestroy(&ptap->P_loc));
51   PetscCall(MatDestroy(&ptap->P_oth));
52   PetscCall(MatDestroy(&ptap->A_loc)); /* used by MatTransposeMatMult() */
53   PetscCall(MatDestroy(&ptap->Rd));
54   PetscCall(MatDestroy(&ptap->Ro));
55   if (ptap->AP_loc) { /* used by alg_rap */
56     Mat_SeqAIJ *ap = (Mat_SeqAIJ *)ptap->AP_loc->data;
57     PetscCall(PetscFree(ap->i));
58     PetscCall(PetscFree2(ap->j, ap->a));
59     PetscCall(MatDestroy(&ptap->AP_loc));
60   } else { /* used by alg_ptap */
61     PetscCall(PetscFree(ptap->api));
62     PetscCall(PetscFree(ptap->apj));
63   }
64   PetscCall(MatDestroy(&ptap->C_loc));
65   PetscCall(MatDestroy(&ptap->C_oth));
66   if (ptap->apa) PetscCall(PetscFree(ptap->apa));
67 
68   PetscCall(MatDestroy(&ptap->Pt));
69 
70   merge = ptap->merge;
71   if (merge) { /* used by alg_ptap */
72     PetscCall(PetscFree(merge->id_r));
73     PetscCall(PetscFree(merge->len_s));
74     PetscCall(PetscFree(merge->len_r));
75     PetscCall(PetscFree(merge->bi));
76     PetscCall(PetscFree(merge->bj));
77     PetscCall(PetscFree(merge->buf_ri[0]));
78     PetscCall(PetscFree(merge->buf_ri));
79     PetscCall(PetscFree(merge->buf_rj[0]));
80     PetscCall(PetscFree(merge->buf_rj));
81     PetscCall(PetscFree(merge->coi));
82     PetscCall(PetscFree(merge->coj));
83     PetscCall(PetscFree(merge->owners_co));
84     PetscCall(PetscLayoutDestroy(&merge->rowmap));
85     PetscCall(PetscFree(ptap->merge));
86   }
87   PetscCall(ISLocalToGlobalMappingDestroy(&ptap->ltog));
88 
89   PetscCall(PetscSFDestroy(&ptap->sf));
90   PetscCall(PetscFree(ptap->c_othi));
91   PetscCall(PetscFree(ptap->c_rmti));
92   PetscCall(PetscFree(ptap));
93   PetscFunctionReturn(PETSC_SUCCESS);
94 }
95 
96 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, Mat C)
97 {
98   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
99   Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)a->A->data, *ao = (Mat_SeqAIJ *)a->B->data;
100   Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
101   Mat_APMPI         *ptap;
102   Mat                AP_loc, C_loc, C_oth;
103   PetscInt           i, rstart, rend, cm, ncols, row, *api, *apj, am = A->rmap->n, apnz, nout;
104   PetscScalar       *apa;
105   const PetscInt    *cols;
106   const PetscScalar *vals;
107 
108   PetscFunctionBegin;
109   MatCheckProduct(C, 3);
110   ptap = (Mat_APMPI *)C->product->data;
111   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
112   PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
113 
114   PetscCall(MatZeroEntries(C));
115 
116   /* 1) get R = Pd^T,Ro = Po^T */
117   if (ptap->reuse == MAT_REUSE_MATRIX) {
118     PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
119     PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
120   }
121 
122   /* 2) get AP_loc */
123   AP_loc = ptap->AP_loc;
124   ap     = (Mat_SeqAIJ *)AP_loc->data;
125 
126   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
127   if (ptap->reuse == MAT_REUSE_MATRIX) {
128     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
129     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
130     PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
131   }
132 
133   /* 2-2) compute numeric A_loc*P - dominating part */
134   /* get data from symbolic products */
135   p_loc = (Mat_SeqAIJ *)ptap->P_loc->data;
136   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)ptap->P_oth->data;
137 
138   api = ap->i;
139   apj = ap->j;
140   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, api[AP_loc->rmap->n], apj, apj));
141   for (i = 0; i < am; i++) {
142     /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
143     apnz = api[i + 1] - api[i];
144     apa  = ap->a + api[i];
145     PetscCall(PetscArrayzero(apa, apnz));
146     AProw_scalable(i, ad, ao, p_loc, p_oth, api, apj, apa);
147   }
148   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, api[AP_loc->rmap->n], apj, &nout, apj));
149   PetscCheck(api[AP_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, api[AP_loc->rmap->n], nout);
150 
151   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
152   /* Always use scalable version since we are in the MPI scalable version */
153   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd, AP_loc, ptap->C_loc));
154   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro, AP_loc, ptap->C_oth));
155 
156   C_loc = ptap->C_loc;
157   C_oth = ptap->C_oth;
158 
159   /* add C_loc and Co to C */
160   PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
161 
162   /* C_loc -> C */
163   cm    = C_loc->rmap->N;
164   c_seq = (Mat_SeqAIJ *)C_loc->data;
165   cols  = c_seq->j;
166   vals  = c_seq->a;
167   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_loc->rmap->n], c_seq->j, c_seq->j));
168 
169   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
170   /* when there are no off-processor parts.  */
171   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
172   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
173   /* a table, and other, more complex stuff has to be done. */
174   if (C->assembled) {
175     C->was_assembled = PETSC_TRUE;
176     C->assembled     = PETSC_FALSE;
177   }
178   if (C->was_assembled) {
179     for (i = 0; i < cm; i++) {
180       ncols = c_seq->i[i + 1] - c_seq->i[i];
181       row   = rstart + i;
182       PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
183       cols += ncols;
184       vals += ncols;
185     }
186   } else {
187     PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
188   }
189   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_loc->rmap->n], c_seq->j, &nout, c_seq->j));
190   PetscCheck(c_seq->i[C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout);
191 
192   /* Co -> C, off-processor part */
193   cm    = C_oth->rmap->N;
194   c_seq = (Mat_SeqAIJ *)C_oth->data;
195   cols  = c_seq->j;
196   vals  = c_seq->a;
197   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_oth->rmap->n], c_seq->j, c_seq->j));
198   for (i = 0; i < cm; i++) {
199     ncols = c_seq->i[i + 1] - c_seq->i[i];
200     row   = p->garray[i];
201     PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
202     cols += ncols;
203     vals += ncols;
204   }
205   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
206   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
207 
208   ptap->reuse = MAT_REUSE_MATRIX;
209 
210   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_oth->rmap->n], c_seq->j, &nout, c_seq->j));
211   PetscCheck(c_seq->i[C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout);
212   PetscFunctionReturn(PETSC_SUCCESS);
213 }
214 
215 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, PetscReal fill, Mat Cmpi)
216 {
217   Mat_APMPI               *ptap;
218   Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
219   MPI_Comm                 comm;
220   PetscMPIInt              size, rank;
221   Mat                      P_loc, P_oth;
222   PetscFreeSpaceList       free_space = NULL, current_space = NULL;
223   PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
224   PetscInt                *lnk, i, k, pnz, row;
225   PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv, nsend, proc;
226   PETSC_UNUSED PetscMPIInt icompleted = 0;
227   PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
228   const PetscInt          *owners;
229   PetscInt                 len, *dnz, *onz, nzi, nspacedouble;
230   PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
231   MPI_Request             *swaits, *rwaits;
232   MPI_Status              *sstatus, rstatus;
233   PetscLayout              rowmap;
234   PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
235   PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
236   PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, Crmax, *aj, *ai, *pi, nout;
237   Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)a->A->data, *ao = NULL, *c_loc, *c_oth;
238   PetscScalar             *apv;
239   PetscHMapI               ta;
240   MatType                  mtype;
241   const char              *prefix;
242 #if defined(PETSC_USE_INFO)
243   PetscReal apfill;
244 #endif
245 
246   PetscFunctionBegin;
247   MatCheckProduct(Cmpi, 4);
248   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
249   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
250   PetscCallMPI(MPI_Comm_size(comm, &size));
251   PetscCallMPI(MPI_Comm_rank(comm, &rank));
252 
253   if (size > 1) ao = (Mat_SeqAIJ *)a->B->data;
254 
255   /* create symbolic parallel matrix Cmpi */
256   PetscCall(MatGetType(A, &mtype));
257   PetscCall(MatSetType(Cmpi, mtype));
258 
259   /* create struct Mat_APMPI and attached it to C later */
260   PetscCall(PetscNew(&ptap));
261   ptap->reuse   = MAT_INITIAL_MATRIX;
262   ptap->algType = 0;
263 
264   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
265   PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &P_oth));
266   /* get P_loc by taking all local rows of P */
267   PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &P_loc));
268 
269   ptap->P_loc = P_loc;
270   ptap->P_oth = P_oth;
271 
272   /* (0) compute Rd = Pd^T, Ro = Po^T  */
273   PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
274   PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));
275 
276   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
277   p_loc = (Mat_SeqAIJ *)P_loc->data;
278   if (P_oth) p_oth = (Mat_SeqAIJ *)P_oth->data;
279 
280   /* create and initialize a linked list */
281   PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* for compute AP_loc and Cmpi */
282   MatRowMergeMax_SeqAIJ(p_loc, P_loc->rmap->N, ta);
283   MatRowMergeMax_SeqAIJ(p_oth, P_oth->rmap->N, ta);
284   PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
285 
286   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
287 
288   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
289   if (ao) {
290     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
291   } else {
292     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
293   }
294   current_space = free_space;
295   nspacedouble  = 0;
296 
297   PetscCall(PetscMalloc1(am + 1, &api));
298   api[0] = 0;
299   for (i = 0; i < am; i++) {
300     /* diagonal portion: Ad[i,:]*P */
301     ai  = ad->i;
302     pi  = p_loc->i;
303     nzi = ai[i + 1] - ai[i];
304     aj  = PetscSafePointerPlusOffset(ad->j, ai[i]);
305     for (j = 0; j < nzi; j++) {
306       row  = aj[j];
307       pnz  = pi[row + 1] - pi[row];
308       Jptr = p_loc->j + pi[row];
309       /* add non-zero cols of P into the sorted linked list lnk */
310       PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
311     }
312     /* off-diagonal portion: Ao[i,:]*P */
313     if (ao) {
314       ai  = ao->i;
315       pi  = p_oth->i;
316       nzi = ai[i + 1] - ai[i];
317       aj  = PetscSafePointerPlusOffset(ao->j, ai[i]);
318       for (j = 0; j < nzi; j++) {
319         row  = aj[j];
320         pnz  = pi[row + 1] - pi[row];
321         Jptr = p_oth->j + pi[row];
322         PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
323       }
324     }
325     apnz       = lnk[0];
326     api[i + 1] = api[i] + apnz;
327 
328     /* if free space is not available, double the total space in the list */
329     if (current_space->local_remaining < apnz) {
330       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
331       nspacedouble++;
332     }
333 
334     /* Copy data into free space, then initialize lnk */
335     PetscCall(PetscLLCondensedClean_Scalable(apnz, current_space->array, lnk));
336 
337     current_space->array += apnz;
338     current_space->local_used += apnz;
339     current_space->local_remaining -= apnz;
340   }
341   /* Allocate space for apj and apv, initialize apj, and */
342   /* destroy list of free space and other temporary array(s) */
343   PetscCall(PetscCalloc2(api[am], &apj, api[am], &apv));
344   PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
345   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
346 
347   /* Create AP_loc for reuse */
348   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
349   PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog));
350 
351 #if defined(PETSC_USE_INFO)
352   if (ao) {
353     apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
354   } else {
355     apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
356   }
357   ptap->AP_loc->info.mallocs           = nspacedouble;
358   ptap->AP_loc->info.fill_ratio_given  = fill;
359   ptap->AP_loc->info.fill_ratio_needed = apfill;
360 
361   if (api[am]) {
362     PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
363     PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n", (double)apfill));
364   } else {
365     PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc is empty \n"));
366   }
367 #endif
368 
369   /* (2-1) compute symbolic Co = Ro*AP_loc  */
370   PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
371   PetscCall(MatGetOptionsPrefix(A, &prefix));
372   PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
373   PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_offdiag_"));
374 
375   PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
376   PetscCall(MatProductSetAlgorithm(ptap->C_oth, "sorted"));
377   PetscCall(MatProductSetFill(ptap->C_oth, fill));
378   PetscCall(MatProductSetFromOptions(ptap->C_oth));
379   PetscCall(MatProductSymbolic(ptap->C_oth));
380 
381   /* (3) send coj of C_oth to other processors  */
382   /* determine row ownership */
383   PetscCall(PetscLayoutCreate(comm, &rowmap));
384   PetscCall(PetscLayoutSetLocalSize(rowmap, pn));
385   PetscCall(PetscLayoutSetBlockSize(rowmap, 1));
386   PetscCall(PetscLayoutSetUp(rowmap));
387   PetscCall(PetscLayoutGetRanges(rowmap, &owners));
388 
389   /* determine the number of messages to send, their lengths */
390   PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
391   PetscCall(PetscArrayzero(len_s, size));
392   PetscCall(PetscArrayzero(len_si, size));
393 
394   c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
395   coi   = c_oth->i;
396   coj   = c_oth->j;
397   con   = ptap->C_oth->rmap->n;
398   proc  = 0;
399   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, coi[con], coj, coj));
400   for (i = 0; i < con; i++) {
401     while (prmap[i] >= owners[proc + 1]) proc++;
402     len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
403     len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
404   }
405 
406   len          = 0; /* max length of buf_si[], see (4) */
407   owners_co[0] = 0;
408   nsend        = 0;
409   for (proc = 0; proc < size; proc++) {
410     owners_co[proc + 1] = owners_co[proc] + len_si[proc];
411     if (len_s[proc]) {
412       nsend++;
413       len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
414       len += len_si[proc];
415     }
416   }
417 
418   /* determine the number and length of messages to receive for coi and coj  */
419   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
420   PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));
421 
422   /* post the Irecv and Isend of coj */
423   PetscCall(PetscCommGetNewTag(comm, &tagj));
424   PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
425   PetscCall(PetscMalloc1(nsend + 1, &swaits));
426   for (proc = 0, k = 0; proc < size; proc++) {
427     if (!len_s[proc]) continue;
428     i = owners_co[proc];
429     PetscCallMPI(MPIU_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
430     k++;
431   }
432 
433   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
434   PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
435   PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
436   PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
437   PetscCall(MatProductSetFill(ptap->C_loc, fill));
438 
439   PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
440   PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_diag_"));
441 
442   PetscCall(MatProductSetFromOptions(ptap->C_loc));
443   PetscCall(MatProductSymbolic(ptap->C_loc));
444 
445   c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
446   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, c_loc->j));
447 
448   /* receives coj are complete */
449   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
450   PetscCall(PetscFree(rwaits));
451   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
452 
453   /* add received column indices into ta to update Crmax */
454   for (k = 0; k < nrecv; k++) { /* k-th received message */
455     Jptr = buf_rj[k];
456     for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1));
457   }
458   PetscCall(PetscHMapIGetSize(ta, &Crmax));
459   PetscCall(PetscHMapIDestroy(&ta));
460 
461   /* (4) send and recv coi */
462   PetscCall(PetscCommGetNewTag(comm, &tagi));
463   PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
464   PetscCall(PetscMalloc1(len + 1, &buf_s));
465   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
466   for (proc = 0, k = 0; proc < size; proc++) {
467     if (!len_s[proc]) continue;
468     /* form outgoing message for i-structure:
469          buf_si[0]:                 nrows to be sent
470                [1:nrows]:           row index (global)
471                [nrows+1:2*nrows+1]: i-structure index
472     */
473     nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
474     buf_si_i    = buf_si + nrows + 1;
475     buf_si[0]   = nrows;
476     buf_si_i[0] = 0;
477     nrows       = 0;
478     for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
479       nzi                 = coi[i + 1] - coi[i];
480       buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
481       buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
482       nrows++;
483     }
484     PetscCallMPI(MPIU_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
485     k++;
486     buf_si += len_si[proc];
487   }
488   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
489   PetscCall(PetscFree(rwaits));
490   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
491 
492   PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
493   PetscCall(PetscFree(len_ri));
494   PetscCall(PetscFree(swaits));
495   PetscCall(PetscFree(buf_s));
496 
497   /* (5) compute the local portion of Cmpi      */
498   /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
499   PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
500   current_space = free_space;
501 
502   PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
503   for (k = 0; k < nrecv; k++) {
504     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
505     nrows       = *buf_ri_k[k];
506     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
507     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
508   }
509 
510   MatPreallocateBegin(comm, pn, pn, dnz, onz);
511   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
512   for (i = 0; i < pn; i++) {
513     /* add C_loc into Cmpi */
514     nzi  = c_loc->i[i + 1] - c_loc->i[i];
515     Jptr = c_loc->j + c_loc->i[i];
516     PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
517 
518     /* add received col data into lnk */
519     for (k = 0; k < nrecv; k++) { /* k-th received message */
520       if (i == *nextrow[k]) {     /* i-th row */
521         nzi  = *(nextci[k] + 1) - *nextci[k];
522         Jptr = buf_rj[k] + *nextci[k];
523         PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
524         nextrow[k]++;
525         nextci[k]++;
526       }
527     }
528     nzi = lnk[0];
529 
530     /* copy data into free space, then initialize lnk */
531     PetscCall(PetscLLCondensedClean_Scalable(nzi, current_space->array, lnk));
532     PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
533   }
534   PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
535   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
536   PetscCall(PetscFreeSpaceDestroy(free_space));
537 
538   /* local sizes and preallocation */
539   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
540   if (P->cmap->bs > 0) {
541     PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
542     PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
543   }
544   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
545   MatPreallocateEnd(dnz, onz);
546 
547   /* members in merge */
548   PetscCall(PetscFree(id_r));
549   PetscCall(PetscFree(len_r));
550   PetscCall(PetscFree(buf_ri[0]));
551   PetscCall(PetscFree(buf_ri));
552   PetscCall(PetscFree(buf_rj[0]));
553   PetscCall(PetscFree(buf_rj));
554   PetscCall(PetscLayoutDestroy(&rowmap));
555 
556   nout = 0;
557   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_oth->i[ptap->C_oth->rmap->n], c_oth->j, &nout, c_oth->j));
558   PetscCheck(c_oth->i[ptap->C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_oth->i[ptap->C_oth->rmap->n], nout);
559   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, &nout, c_loc->j));
560   PetscCheck(c_loc->i[ptap->C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_loc->i[ptap->C_loc->rmap->n], nout);
561 
562   /* attach the supporting struct to Cmpi for reuse */
563   Cmpi->product->data    = ptap;
564   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
565   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
566 
567   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
568   Cmpi->assembled        = PETSC_FALSE;
569   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
570   PetscFunctionReturn(PETSC_SUCCESS);
571 }
572 
573 static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHSetI dht, PetscHSetI oht)
574 {
575   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
576   Mat_SeqAIJ *ad = (Mat_SeqAIJ *)a->A->data, *ao = (Mat_SeqAIJ *)a->B->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
577   PetscInt   *ai, nzi, j, *aj, row, col, *pi, *pj, pnz, nzpi, *p_othcols, k;
578   PetscInt    pcstart, pcend, column, offset;
579 
580   PetscFunctionBegin;
581   pcstart = P->cmap->rstart;
582   pcstart *= dof;
583   pcend = P->cmap->rend;
584   pcend *= dof;
585   /* diagonal portion: Ad[i,:]*P */
586   ai  = ad->i;
587   nzi = ai[i + 1] - ai[i];
588   aj  = ad->j + ai[i];
589   for (j = 0; j < nzi; j++) {
590     row    = aj[j];
591     offset = row % dof;
592     row /= dof;
593     nzpi = pd->i[row + 1] - pd->i[row];
594     pj   = pd->j + pd->i[row];
595     for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(dht, pj[k] * dof + offset + pcstart));
596   }
597   /* off-diagonal P */
598   for (j = 0; j < nzi; j++) {
599     row    = aj[j];
600     offset = row % dof;
601     row /= dof;
602     nzpi = po->i[row + 1] - po->i[row];
603     pj   = PetscSafePointerPlusOffset(po->j, po->i[row]);
604     for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(oht, p->garray[pj[k]] * dof + offset));
605   }
606 
607   /* off-diagonal part: Ao[i, :]*P_oth */
608   if (ao) {
609     ai  = ao->i;
610     pi  = p_oth->i;
611     nzi = ai[i + 1] - ai[i];
612     aj  = ao->j + ai[i];
613     for (j = 0; j < nzi; j++) {
614       row       = aj[j];
615       offset    = a->garray[row] % dof;
616       row       = map[row];
617       pnz       = pi[row + 1] - pi[row];
618       p_othcols = p_oth->j + pi[row];
619       for (col = 0; col < pnz; col++) {
620         column = p_othcols[col] * dof + offset;
621         if (column >= pcstart && column < pcend) {
622           PetscCall(PetscHSetIAdd(dht, column));
623         } else {
624           PetscCall(PetscHSetIAdd(oht, column));
625         }
626       }
627     }
628   } /* end if (ao) */
629   PetscFunctionReturn(PETSC_SUCCESS);
630 }
631 
632 static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHMapIV hmap)
633 {
634   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
635   Mat_SeqAIJ *ad = (Mat_SeqAIJ *)a->A->data, *ao = (Mat_SeqAIJ *)a->B->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
636   PetscInt   *ai, nzi, j, *aj, row, col, *pi, pnz, *p_othcols, pcstart, *pj, k, nzpi, offset;
637   PetscScalar ra, *aa, *pa;
638 
639   PetscFunctionBegin;
640   pcstart = P->cmap->rstart;
641   pcstart *= dof;
642 
643   /* diagonal portion: Ad[i,:]*P */
644   ai  = ad->i;
645   nzi = ai[i + 1] - ai[i];
646   aj  = ad->j + ai[i];
647   aa  = ad->a + ai[i];
648   for (j = 0; j < nzi; j++) {
649     ra     = aa[j];
650     row    = aj[j];
651     offset = row % dof;
652     row /= dof;
653     nzpi = pd->i[row + 1] - pd->i[row];
654     pj   = pd->j + pd->i[row];
655     pa   = pd->a + pd->i[row];
656     for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, pj[k] * dof + offset + pcstart, ra * pa[k]));
657     PetscCall(PetscLogFlops(2.0 * nzpi));
658   }
659   for (j = 0; j < nzi; j++) {
660     ra     = aa[j];
661     row    = aj[j];
662     offset = row % dof;
663     row /= dof;
664     nzpi = po->i[row + 1] - po->i[row];
665     pj   = PetscSafePointerPlusOffset(po->j, po->i[row]);
666     pa   = PetscSafePointerPlusOffset(po->a, po->i[row]);
667     for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, p->garray[pj[k]] * dof + offset, ra * pa[k]));
668     PetscCall(PetscLogFlops(2.0 * nzpi));
669   }
670 
671   /* off-diagonal part: Ao[i, :]*P_oth */
672   if (ao) {
673     ai  = ao->i;
674     pi  = p_oth->i;
675     nzi = ai[i + 1] - ai[i];
676     aj  = ao->j + ai[i];
677     aa  = ao->a + ai[i];
678     for (j = 0; j < nzi; j++) {
679       row       = aj[j];
680       offset    = a->garray[row] % dof;
681       row       = map[row];
682       ra        = aa[j];
683       pnz       = pi[row + 1] - pi[row];
684       p_othcols = p_oth->j + pi[row];
685       pa        = p_oth->a + pi[row];
686       for (col = 0; col < pnz; col++) PetscCall(PetscHMapIVAddValue(hmap, p_othcols[col] * dof + offset, ra * pa[col]));
687       PetscCall(PetscLogFlops(2.0 * pnz));
688     }
689   } /* end if (ao) */
690   PetscFunctionReturn(PETSC_SUCCESS);
691 }
692 
693 PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat, Mat, PetscInt dof, MatReuse, Mat *);
694 
695 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, Mat C)
696 {
697   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
698   Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
699   Mat_APMPI      *ptap;
700   PetscHMapIV     hmap;
701   PetscInt        i, j, jj, kk, nzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, ccstart, ccend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, *dcc, *occ, loc;
702   PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
703   PetscInt        offset, ii, pocol;
704   const PetscInt *mappingindices;
705   IS              map;
706 
707   PetscFunctionBegin;
708   MatCheckProduct(C, 4);
709   ptap = (Mat_APMPI *)C->product->data;
710   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
711   PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
712 
713   PetscCall(MatZeroEntries(C));
714 
715   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
716   if (ptap->reuse == MAT_REUSE_MATRIX) {
717     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
718     PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
719   }
720   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
721 
722   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
723   pon *= dof;
724   PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
725   PetscCall(MatGetLocalSize(A, &am, NULL));
726   cmaxr = 0;
727   for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
728   PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
729   PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
730   PetscCall(ISGetIndices(map, &mappingindices));
731   for (i = 0; i < am && pon; i++) {
732     PetscCall(PetscHMapIVClear(hmap));
733     offset = i % dof;
734     ii     = i / dof;
735     nzi    = po->i[ii + 1] - po->i[ii];
736     if (!nzi) continue;
737     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
738     voff = 0;
739     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
740     if (!voff) continue;
741 
742     /* Form C(ii, :) */
743     poj = po->j + po->i[ii];
744     poa = po->a + po->i[ii];
745     for (j = 0; j < nzi; j++) {
746       pocol   = poj[j] * dof + offset;
747       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
748       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
749       for (jj = 0; jj < voff; jj++) {
750         apvaluestmp[jj] = apvalues[jj] * poa[j];
751         /* If the row is empty */
752         if (!c_rmtc[pocol]) {
753           c_rmtjj[jj] = apindices[jj];
754           c_rmtaa[jj] = apvaluestmp[jj];
755           c_rmtc[pocol]++;
756         } else {
757           PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
758           if (loc >= 0) { /* hit */
759             c_rmtaa[loc] += apvaluestmp[jj];
760             PetscCall(PetscLogFlops(1.0));
761           } else { /* new element */
762             loc = -(loc + 1);
763             /* Move data backward */
764             for (kk = c_rmtc[pocol]; kk > loc; kk--) {
765               c_rmtjj[kk] = c_rmtjj[kk - 1];
766               c_rmtaa[kk] = c_rmtaa[kk - 1];
767             } /* End kk */
768             c_rmtjj[loc] = apindices[jj];
769             c_rmtaa[loc] = apvaluestmp[jj];
770             c_rmtc[pocol]++;
771           }
772         }
773         PetscCall(PetscLogFlops(voff));
774       } /* End jj */
775     } /* End j */
776   } /* End i */
777 
778   PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
779   PetscCall(PetscHMapIVDestroy(&hmap));
780 
781   PetscCall(MatGetLocalSize(P, NULL, &pn));
782   pn *= dof;
783   PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));
784 
785   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
786   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
787   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
788   pcstart = pcstart * dof;
789   pcend   = pcend * dof;
790   cd      = (Mat_SeqAIJ *)c->A->data;
791   co      = (Mat_SeqAIJ *)c->B->data;
792 
793   cmaxr = 0;
794   for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
795   PetscCall(PetscCalloc5(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pn, &dcc, pn, &occ));
796   PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
797   for (i = 0; i < am && pn; i++) {
798     PetscCall(PetscHMapIVClear(hmap));
799     offset = i % dof;
800     ii     = i / dof;
801     nzi    = pd->i[ii + 1] - pd->i[ii];
802     if (!nzi) continue;
803     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
804     voff = 0;
805     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
806     if (!voff) continue;
807     /* Form C(ii, :) */
808     pdj = pd->j + pd->i[ii];
809     pda = pd->a + pd->i[ii];
810     for (j = 0; j < nzi; j++) {
811       row = pcstart + pdj[j] * dof + offset;
812       for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
813       PetscCall(PetscLogFlops(voff));
814       PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
815     }
816   }
817   PetscCall(ISRestoreIndices(map, &mappingindices));
818   PetscCall(MatGetOwnershipRangeColumn(C, &ccstart, &ccend));
819   PetscCall(PetscFree5(apindices, apvalues, apvaluestmp, dcc, occ));
820   PetscCall(PetscHMapIVDestroy(&hmap));
821   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
822   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
823   PetscCall(PetscFree2(c_rmtj, c_rmta));
824 
825   /* Add contributions from remote */
826   for (i = 0; i < pn; i++) {
827     row = i + pcstart;
828     PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], PetscSafePointerPlusOffset(c_othj, ptap->c_othi[i]), PetscSafePointerPlusOffset(c_otha, ptap->c_othi[i]), ADD_VALUES));
829   }
830   PetscCall(PetscFree2(c_othj, c_otha));
831 
832   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
833   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
834 
835   ptap->reuse = MAT_REUSE_MATRIX;
836   PetscFunctionReturn(PETSC_SUCCESS);
837 }
838 
839 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, Mat C)
840 {
841   PetscFunctionBegin;
842   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, C));
843   PetscFunctionReturn(PETSC_SUCCESS);
844 }
845 
846 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, Mat C)
847 {
848   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
849   Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
850   Mat_APMPI      *ptap;
851   PetscHMapIV     hmap;
852   PetscInt        i, j, jj, kk, nzi, dnzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, loc;
853   PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
854   PetscInt        offset, ii, pocol;
855   const PetscInt *mappingindices;
856   IS              map;
857 
858   PetscFunctionBegin;
859   MatCheckProduct(C, 4);
860   ptap = (Mat_APMPI *)C->product->data;
861   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
862   PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
863 
864   PetscCall(MatZeroEntries(C));
865 
866   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
867   if (ptap->reuse == MAT_REUSE_MATRIX) {
868     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
869     PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
870   }
871   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
872   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
873   pon *= dof;
874   PetscCall(MatGetLocalSize(P, NULL, &pn));
875   pn *= dof;
876 
877   PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
878   PetscCall(MatGetLocalSize(A, &am, NULL));
879   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
880   pcstart *= dof;
881   pcend *= dof;
882   cmaxr = 0;
883   for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
884   cd = (Mat_SeqAIJ *)c->A->data;
885   co = (Mat_SeqAIJ *)c->B->data;
886   for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
887   PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
888   PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
889   PetscCall(ISGetIndices(map, &mappingindices));
890   for (i = 0; i < am && (pon || pn); i++) {
891     PetscCall(PetscHMapIVClear(hmap));
892     offset = i % dof;
893     ii     = i / dof;
894     nzi    = po->i[ii + 1] - po->i[ii];
895     dnzi   = pd->i[ii + 1] - pd->i[ii];
896     if (!nzi && !dnzi) continue;
897     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
898     voff = 0;
899     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
900     if (!voff) continue;
901 
902     /* Form remote C(ii, :) */
903     poj = PetscSafePointerPlusOffset(po->j, po->i[ii]);
904     poa = PetscSafePointerPlusOffset(po->a, po->i[ii]);
905     for (j = 0; j < nzi; j++) {
906       pocol   = poj[j] * dof + offset;
907       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
908       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
909       for (jj = 0; jj < voff; jj++) {
910         apvaluestmp[jj] = apvalues[jj] * poa[j];
911         /* If the row is empty */
912         if (!c_rmtc[pocol]) {
913           c_rmtjj[jj] = apindices[jj];
914           c_rmtaa[jj] = apvaluestmp[jj];
915           c_rmtc[pocol]++;
916         } else {
917           PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
918           if (loc >= 0) { /* hit */
919             c_rmtaa[loc] += apvaluestmp[jj];
920             PetscCall(PetscLogFlops(1.0));
921           } else { /* new element */
922             loc = -(loc + 1);
923             /* Move data backward */
924             for (kk = c_rmtc[pocol]; kk > loc; kk--) {
925               c_rmtjj[kk] = c_rmtjj[kk - 1];
926               c_rmtaa[kk] = c_rmtaa[kk - 1];
927             } /* End kk */
928             c_rmtjj[loc] = apindices[jj];
929             c_rmtaa[loc] = apvaluestmp[jj];
930             c_rmtc[pocol]++;
931           }
932         }
933       } /* End jj */
934       PetscCall(PetscLogFlops(voff));
935     } /* End j */
936 
937     /* Form local C(ii, :) */
938     pdj = pd->j + pd->i[ii];
939     pda = pd->a + pd->i[ii];
940     for (j = 0; j < dnzi; j++) {
941       row = pcstart + pdj[j] * dof + offset;
942       for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j]; /* End kk */
943       PetscCall(PetscLogFlops(voff));
944       PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
945     } /* End j */
946   } /* End i */
947 
948   PetscCall(ISRestoreIndices(map, &mappingindices));
949   PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
950   PetscCall(PetscHMapIVDestroy(&hmap));
951   PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));
952 
953   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
954   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
955   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
956   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
957   PetscCall(PetscFree2(c_rmtj, c_rmta));
958 
959   /* Add contributions from remote */
960   for (i = 0; i < pn; i++) {
961     row = i + pcstart;
962     PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], PetscSafePointerPlusOffset(c_othj, ptap->c_othi[i]), PetscSafePointerPlusOffset(c_otha, ptap->c_othi[i]), ADD_VALUES));
963   }
964   PetscCall(PetscFree2(c_othj, c_otha));
965 
966   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
967   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
968 
969   ptap->reuse = MAT_REUSE_MATRIX;
970   PetscFunctionReturn(PETSC_SUCCESS);
971 }
972 
973 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, Mat C)
974 {
975   PetscFunctionBegin;
976   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, C));
977   PetscFunctionReturn(PETSC_SUCCESS);
978 }
979 
980 /* TODO: move algorithm selection to MatProductSetFromOptions */
981 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
982 {
983   Mat_APMPI      *ptap;
984   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
985   MPI_Comm        comm;
986   Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
987   MatType         mtype;
988   PetscSF         sf;
989   PetscSFNode    *iremote;
990   PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
991   const PetscInt *rootdegrees;
992   PetscHSetI      ht, oht, *hta, *hto;
993   PetscInt        pn, pon, *c_rmtc, i, j, nzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
994   PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
995   PetscInt        nalg = 2, alg = 0, offset, ii;
996   PetscMPIInt     owner;
997   const PetscInt *mappingindices;
998   PetscBool       flg;
999   const char     *algTypes[2] = {"overlapping", "merged"};
1000   IS              map;
1001 
1002   PetscFunctionBegin;
1003   MatCheckProduct(Cmpi, 5);
1004   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
1005   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1006 
1007   /* Create symbolic parallel matrix Cmpi */
1008   PetscCall(MatGetLocalSize(P, NULL, &pn));
1009   pn *= dof;
1010   PetscCall(MatGetType(A, &mtype));
1011   PetscCall(MatSetType(Cmpi, mtype));
1012   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1013 
1014   PetscCall(PetscNew(&ptap));
1015   ptap->reuse   = MAT_INITIAL_MATRIX;
1016   ptap->algType = 2;
1017 
1018   /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1019   PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
1020   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
1021   /* This equals to the number of offdiag columns in P */
1022   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1023   pon *= dof;
1024   /* offsets */
1025   PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
1026   /* The number of columns we will send to remote ranks */
1027   PetscCall(PetscMalloc1(pon, &c_rmtc));
1028   PetscCall(PetscMalloc1(pon, &hta));
1029   for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
1030   PetscCall(MatGetLocalSize(A, &am, NULL));
1031   PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
1032   /* Create hash table to merge all columns for C(i, :) */
1033   PetscCall(PetscHSetICreate(&ht));
1034 
1035   PetscCall(ISGetIndices(map, &mappingindices));
1036   ptap->c_rmti[0] = 0;
1037   /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
1038   for (i = 0; i < am && pon; i++) {
1039     /* Form one row of AP */
1040     PetscCall(PetscHSetIClear(ht));
1041     offset = i % dof;
1042     ii     = i / dof;
1043     /* If the off-diagonal is empty, do not do any calculation */
1044     nzi = po->i[ii + 1] - po->i[ii];
1045     if (!nzi) continue;
1046 
1047     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, ht));
1048     PetscCall(PetscHSetIGetSize(ht, &htsize));
1049     /* If AP is empty, just continue */
1050     if (!htsize) continue;
1051     /* Form C(ii, :) */
1052     poj = po->j + po->i[ii];
1053     for (j = 0; j < nzi; j++) PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
1054   }
1055 
1056   for (i = 0; i < pon; i++) {
1057     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1058     ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
1059     c_rmtc[i]           = htsize;
1060   }
1061 
1062   PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));
1063 
1064   for (i = 0; i < pon; i++) {
1065     off = 0;
1066     PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
1067     PetscCall(PetscHSetIDestroy(&hta[i]));
1068   }
1069   PetscCall(PetscFree(hta));
1070 
1071   PetscCall(PetscMalloc1(pon, &iremote));
1072   for (i = 0; i < pon; i++) {
1073     owner  = 0;
1074     lidx   = 0;
1075     offset = i % dof;
1076     ii     = i / dof;
1077     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1078     iremote[i].index = lidx * dof + offset;
1079     iremote[i].rank  = owner;
1080   }
1081 
1082   PetscCall(PetscSFCreate(comm, &sf));
1083   PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1084   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1085   PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
1086   PetscCall(PetscSFSetFromOptions(sf));
1087   PetscCall(PetscSFSetUp(sf));
1088   /* How many neighbors have contributions to my rows? */
1089   PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
1090   PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
1091   rootspacesize = 0;
1092   for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
1093   PetscCall(PetscMalloc1(rootspacesize, &rootspace));
1094   PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
1095   /* Get information from leaves
1096    * Number of columns other people contribute to my rows
1097    * */
1098   PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
1099   PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
1100   PetscCall(PetscFree(c_rmtc));
1101   PetscCall(PetscCalloc1(pn + 1, &ptap->c_othi));
1102   /* The number of columns is received for each row */
1103   ptap->c_othi[0]     = 0;
1104   rootspacesize       = 0;
1105   rootspaceoffsets[0] = 0;
1106   for (i = 0; i < pn; i++) {
1107     rcvncols = 0;
1108     for (j = 0; j < rootdegrees[i]; j++) {
1109       rcvncols += rootspace[rootspacesize];
1110       rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1111       rootspacesize++;
1112     }
1113     ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
1114   }
1115   PetscCall(PetscFree(rootspace));
1116 
1117   PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
1118   PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1119   PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1120   PetscCall(PetscSFDestroy(&sf));
1121   PetscCall(PetscFree(rootspaceoffsets));
1122 
1123   PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
1124   nleaves = 0;
1125   for (i = 0; i < pon; i++) {
1126     owner = 0;
1127     ii    = i / dof;
1128     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
1129     sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
1130     for (j = 0; j < sendncols; j++) {
1131       iremote[nleaves].rank    = owner;
1132       iremote[nleaves++].index = c_rmtoffsets[i] + j;
1133     }
1134   }
1135   PetscCall(PetscFree(c_rmtoffsets));
1136   PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));
1137 
1138   PetscCall(PetscSFCreate(comm, &ptap->sf));
1139   PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1140   PetscCall(PetscSFSetFromOptions(ptap->sf));
1141   /* One to one map */
1142   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1143 
1144   PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
1145   PetscCall(PetscHSetICreate(&oht));
1146   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1147   pcstart *= dof;
1148   pcend *= dof;
1149   PetscCall(PetscMalloc2(pn, &hta, pn, &hto));
1150   for (i = 0; i < pn; i++) {
1151     PetscCall(PetscHSetICreate(&hta[i]));
1152     PetscCall(PetscHSetICreate(&hto[i]));
1153   }
1154   /* Work on local part */
1155   /* 4) Pass 1: Estimate memory for C_loc */
1156   for (i = 0; i < am && pn; i++) {
1157     PetscCall(PetscHSetIClear(ht));
1158     PetscCall(PetscHSetIClear(oht));
1159     offset = i % dof;
1160     ii     = i / dof;
1161     nzi    = pd->i[ii + 1] - pd->i[ii];
1162     if (!nzi) continue;
1163 
1164     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
1165     PetscCall(PetscHSetIGetSize(ht, &htsize));
1166     PetscCall(PetscHSetIGetSize(oht, &htosize));
1167     if (!(htsize + htosize)) continue;
1168     /* Form C(ii, :) */
1169     pdj = pd->j + pd->i[ii];
1170     for (j = 0; j < nzi; j++) {
1171       PetscCall(PetscHSetIUpdate(hta[pdj[j] * dof + offset], ht));
1172       PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
1173     }
1174   }
1175 
1176   PetscCall(ISRestoreIndices(map, &mappingindices));
1177 
1178   PetscCall(PetscHSetIDestroy(&ht));
1179   PetscCall(PetscHSetIDestroy(&oht));
1180 
1181   /* Get remote data */
1182   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1183   PetscCall(PetscFree(c_rmtj));
1184 
1185   for (i = 0; i < pn; i++) {
1186     nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
1187     rdj = PetscSafePointerPlusOffset(c_othj, ptap->c_othi[i]);
1188     for (j = 0; j < nzi; j++) {
1189       col = rdj[j];
1190       /* diag part */
1191       if (col >= pcstart && col < pcend) {
1192         PetscCall(PetscHSetIAdd(hta[i], col));
1193       } else { /* off-diagonal */
1194         PetscCall(PetscHSetIAdd(hto[i], col));
1195       }
1196     }
1197     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1198     dnz[i] = htsize;
1199     PetscCall(PetscHSetIDestroy(&hta[i]));
1200     PetscCall(PetscHSetIGetSize(hto[i], &htsize));
1201     onz[i] = htsize;
1202     PetscCall(PetscHSetIDestroy(&hto[i]));
1203   }
1204 
1205   PetscCall(PetscFree2(hta, hto));
1206   PetscCall(PetscFree(c_othj));
1207 
1208   /* local sizes and preallocation */
1209   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1210   PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
1211   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1212   PetscCall(MatSetUp(Cmpi));
1213   PetscCall(PetscFree2(dnz, onz));
1214 
1215   /* attach the supporting struct to Cmpi for reuse */
1216   Cmpi->product->data    = ptap;
1217   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1218   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
1219 
1220   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1221   Cmpi->assembled = PETSC_FALSE;
1222   /* pick an algorithm */
1223   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
1224   alg = 0;
1225   PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1226   PetscOptionsEnd();
1227   switch (alg) {
1228   case 0:
1229     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1230     break;
1231   case 1:
1232     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1233     break;
1234   default:
1235     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, " Unsupported allatonce numerical algorithm ");
1236   }
1237   PetscFunctionReturn(PETSC_SUCCESS);
1238 }
1239 
1240 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, PetscReal fill, Mat C)
1241 {
1242   PetscFunctionBegin;
1243   PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, fill, C));
1244   PetscFunctionReturn(PETSC_SUCCESS);
1245 }
1246 
1247 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
1248 {
1249   Mat_APMPI      *ptap;
1250   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
1251   MPI_Comm        comm;
1252   Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
1253   MatType         mtype;
1254   PetscSF         sf;
1255   PetscSFNode    *iremote;
1256   PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
1257   const PetscInt *rootdegrees;
1258   PetscHSetI      ht, oht, *hta, *hto, *htd;
1259   PetscInt        pn, pon, *c_rmtc, i, j, nzi, dnzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
1260   PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
1261   PetscInt        nalg = 2, alg = 0, offset, ii;
1262   PetscMPIInt     owner;
1263   PetscBool       flg;
1264   const char     *algTypes[2] = {"merged", "overlapping"};
1265   const PetscInt *mappingindices;
1266   IS              map;
1267 
1268   PetscFunctionBegin;
1269   MatCheckProduct(Cmpi, 5);
1270   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
1271   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1272 
1273   /* Create symbolic parallel matrix Cmpi */
1274   PetscCall(MatGetLocalSize(P, NULL, &pn));
1275   pn *= dof;
1276   PetscCall(MatGetType(A, &mtype));
1277   PetscCall(MatSetType(Cmpi, mtype));
1278   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1279 
1280   PetscCall(PetscNew(&ptap));
1281   ptap->reuse   = MAT_INITIAL_MATRIX;
1282   ptap->algType = 3;
1283 
1284   /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1285   PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
1286   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
1287 
1288   /* This equals to the number of offdiag columns in P */
1289   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1290   pon *= dof;
1291   /* offsets */
1292   PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
1293   /* The number of columns we will send to remote ranks */
1294   PetscCall(PetscMalloc1(pon, &c_rmtc));
1295   PetscCall(PetscMalloc1(pon, &hta));
1296   for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
1297   PetscCall(MatGetLocalSize(A, &am, NULL));
1298   PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
1299   /* Create hash table to merge all columns for C(i, :) */
1300   PetscCall(PetscHSetICreate(&ht));
1301   PetscCall(PetscHSetICreate(&oht));
1302   PetscCall(PetscMalloc2(pn, &htd, pn, &hto));
1303   for (i = 0; i < pn; i++) {
1304     PetscCall(PetscHSetICreate(&htd[i]));
1305     PetscCall(PetscHSetICreate(&hto[i]));
1306   }
1307 
1308   PetscCall(ISGetIndices(map, &mappingindices));
1309   ptap->c_rmti[0] = 0;
1310   /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
1311   for (i = 0; i < am && (pon || pn); i++) {
1312     /* Form one row of AP */
1313     PetscCall(PetscHSetIClear(ht));
1314     PetscCall(PetscHSetIClear(oht));
1315     offset = i % dof;
1316     ii     = i / dof;
1317     /* If the off-diagonal is empty, do not do any calculation */
1318     nzi  = po->i[ii + 1] - po->i[ii];
1319     dnzi = pd->i[ii + 1] - pd->i[ii];
1320     if (!nzi && !dnzi) continue;
1321 
1322     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
1323     PetscCall(PetscHSetIGetSize(ht, &htsize));
1324     PetscCall(PetscHSetIGetSize(oht, &htosize));
1325     /* If AP is empty, just continue */
1326     if (!(htsize + htosize)) continue;
1327 
1328     /* Form remote C(ii, :) */
1329     poj = PetscSafePointerPlusOffset(po->j, po->i[ii]);
1330     for (j = 0; j < nzi; j++) {
1331       PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
1332       PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], oht));
1333     }
1334 
1335     /* Form local C(ii, :) */
1336     pdj = pd->j + pd->i[ii];
1337     for (j = 0; j < dnzi; j++) {
1338       PetscCall(PetscHSetIUpdate(htd[pdj[j] * dof + offset], ht));
1339       PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
1340     }
1341   }
1342 
1343   PetscCall(ISRestoreIndices(map, &mappingindices));
1344 
1345   PetscCall(PetscHSetIDestroy(&ht));
1346   PetscCall(PetscHSetIDestroy(&oht));
1347 
1348   for (i = 0; i < pon; i++) {
1349     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1350     ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
1351     c_rmtc[i]           = htsize;
1352   }
1353 
1354   PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));
1355 
1356   for (i = 0; i < pon; i++) {
1357     off = 0;
1358     PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
1359     PetscCall(PetscHSetIDestroy(&hta[i]));
1360   }
1361   PetscCall(PetscFree(hta));
1362 
1363   PetscCall(PetscMalloc1(pon, &iremote));
1364   for (i = 0; i < pon; i++) {
1365     owner  = 0;
1366     lidx   = 0;
1367     offset = i % dof;
1368     ii     = i / dof;
1369     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1370     iremote[i].index = lidx * dof + offset;
1371     iremote[i].rank  = owner;
1372   }
1373 
1374   PetscCall(PetscSFCreate(comm, &sf));
1375   PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1376   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1377   PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
1378   PetscCall(PetscSFSetFromOptions(sf));
1379   PetscCall(PetscSFSetUp(sf));
1380   /* How many neighbors have contributions to my rows? */
1381   PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
1382   PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
1383   rootspacesize = 0;
1384   for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
1385   PetscCall(PetscMalloc1(rootspacesize, &rootspace));
1386   PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
1387   /* Get information from leaves
1388    * Number of columns other people contribute to my rows
1389    * */
1390   PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
1391   PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
1392   PetscCall(PetscFree(c_rmtc));
1393   PetscCall(PetscMalloc1(pn + 1, &ptap->c_othi));
1394   /* The number of columns is received for each row */
1395   ptap->c_othi[0]     = 0;
1396   rootspacesize       = 0;
1397   rootspaceoffsets[0] = 0;
1398   for (i = 0; i < pn; i++) {
1399     rcvncols = 0;
1400     for (j = 0; j < rootdegrees[i]; j++) {
1401       rcvncols += rootspace[rootspacesize];
1402       rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1403       rootspacesize++;
1404     }
1405     ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
1406   }
1407   PetscCall(PetscFree(rootspace));
1408 
1409   PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
1410   PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1411   PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1412   PetscCall(PetscSFDestroy(&sf));
1413   PetscCall(PetscFree(rootspaceoffsets));
1414 
1415   PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
1416   nleaves = 0;
1417   for (i = 0; i < pon; i++) {
1418     owner = 0;
1419     ii    = i / dof;
1420     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
1421     sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
1422     for (j = 0; j < sendncols; j++) {
1423       iremote[nleaves].rank    = owner;
1424       iremote[nleaves++].index = c_rmtoffsets[i] + j;
1425     }
1426   }
1427   PetscCall(PetscFree(c_rmtoffsets));
1428   PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));
1429 
1430   PetscCall(PetscSFCreate(comm, &ptap->sf));
1431   PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1432   PetscCall(PetscSFSetFromOptions(ptap->sf));
1433   /* One to one map */
1434   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1435   /* Get remote data */
1436   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1437   PetscCall(PetscFree(c_rmtj));
1438   PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
1439   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1440   pcstart *= dof;
1441   pcend *= dof;
1442   for (i = 0; i < pn; i++) {
1443     nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
1444     rdj = PetscSafePointerPlusOffset(c_othj, ptap->c_othi[i]);
1445     for (j = 0; j < nzi; j++) {
1446       col = rdj[j];
1447       /* diagonal part */
1448       if (col >= pcstart && col < pcend) {
1449         PetscCall(PetscHSetIAdd(htd[i], col));
1450       } else { /* off-diagonal */
1451         PetscCall(PetscHSetIAdd(hto[i], col));
1452       }
1453     }
1454     PetscCall(PetscHSetIGetSize(htd[i], &htsize));
1455     dnz[i] = htsize;
1456     PetscCall(PetscHSetIDestroy(&htd[i]));
1457     PetscCall(PetscHSetIGetSize(hto[i], &htsize));
1458     onz[i] = htsize;
1459     PetscCall(PetscHSetIDestroy(&hto[i]));
1460   }
1461 
1462   PetscCall(PetscFree2(htd, hto));
1463   PetscCall(PetscFree(c_othj));
1464 
1465   /* local sizes and preallocation */
1466   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1467   PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
1468   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1469   PetscCall(PetscFree2(dnz, onz));
1470 
1471   /* attach the supporting struct to Cmpi for reuse */
1472   Cmpi->product->data    = ptap;
1473   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1474   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
1475 
1476   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1477   Cmpi->assembled = PETSC_FALSE;
1478   /* pick an algorithm */
1479   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
1480   alg = 0;
1481   PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1482   PetscOptionsEnd();
1483   switch (alg) {
1484   case 0:
1485     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1486     break;
1487   case 1:
1488     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1489     break;
1490   default:
1491     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, " Unsupported allatonce numerical algorithm ");
1492   }
1493   PetscFunctionReturn(PETSC_SUCCESS);
1494 }
1495 
1496 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, PetscReal fill, Mat C)
1497 {
1498   PetscFunctionBegin;
1499   PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, fill, C));
1500   PetscFunctionReturn(PETSC_SUCCESS);
1501 }
1502 
1503 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A, Mat P, PetscReal fill, Mat Cmpi)
1504 {
1505   Mat_APMPI               *ptap;
1506   Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
1507   MPI_Comm                 comm;
1508   PetscMPIInt              size, rank, nsend, proc;
1509   PetscFreeSpaceList       free_space = NULL, current_space = NULL;
1510   PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
1511   PetscInt                *lnk, i, k, pnz, row;
1512   PetscBT                  lnkbt;
1513   PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
1514   PETSC_UNUSED PetscMPIInt icompleted = 0;
1515   PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
1516   PetscInt                 len, *dnz, *onz, *owners, nzi, nspacedouble;
1517   PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
1518   MPI_Request             *swaits, *rwaits;
1519   MPI_Status              *sstatus, rstatus;
1520   PetscLayout              rowmap;
1521   PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
1522   PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
1523   PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, ap_rmax = 0, Crmax, *aj, *ai, *pi;
1524   Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)a->A->data, *ao = NULL, *c_loc, *c_oth;
1525   PetscScalar             *apv;
1526   PetscHMapI               ta;
1527   MatType                  mtype;
1528   const char              *prefix;
1529 #if defined(PETSC_USE_INFO)
1530   PetscReal apfill;
1531 #endif
1532 
1533   PetscFunctionBegin;
1534   MatCheckProduct(Cmpi, 4);
1535   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
1536   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1537   PetscCallMPI(MPI_Comm_size(comm, &size));
1538   PetscCallMPI(MPI_Comm_rank(comm, &rank));
1539 
1540   if (size > 1) ao = (Mat_SeqAIJ *)a->B->data;
1541 
1542   /* create symbolic parallel matrix Cmpi */
1543   PetscCall(MatGetType(A, &mtype));
1544   PetscCall(MatSetType(Cmpi, mtype));
1545 
1546   /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1547   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
1548 
1549   /* create struct Mat_APMPI and attached it to C later */
1550   PetscCall(PetscNew(&ptap));
1551   ptap->reuse   = MAT_INITIAL_MATRIX;
1552   ptap->algType = 1;
1553 
1554   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1555   PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
1556   /* get P_loc by taking all local rows of P */
1557   PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc));
1558 
1559   /* (0) compute Rd = Pd^T, Ro = Po^T  */
1560   PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
1561   PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));
1562 
1563   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
1564   p_loc = (Mat_SeqAIJ *)ptap->P_loc->data;
1565   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)ptap->P_oth->data;
1566 
1567   /* create and initialize a linked list */
1568   PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* for compute AP_loc and Cmpi */
1569   MatRowMergeMax_SeqAIJ(p_loc, ptap->P_loc->rmap->N, ta);
1570   MatRowMergeMax_SeqAIJ(p_oth, ptap->P_oth->rmap->N, ta);
1571   PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
1572   /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
1573 
1574   PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
1575 
1576   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1577   if (ao) {
1578     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
1579   } else {
1580     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
1581   }
1582   current_space = free_space;
1583   nspacedouble  = 0;
1584 
1585   PetscCall(PetscMalloc1(am + 1, &api));
1586   api[0] = 0;
1587   for (i = 0; i < am; i++) {
1588     /* diagonal portion: Ad[i,:]*P */
1589     ai  = ad->i;
1590     pi  = p_loc->i;
1591     nzi = ai[i + 1] - ai[i];
1592     aj  = PetscSafePointerPlusOffset(ad->j, ai[i]);
1593     for (j = 0; j < nzi; j++) {
1594       row  = aj[j];
1595       pnz  = pi[row + 1] - pi[row];
1596       Jptr = p_loc->j + pi[row];
1597       /* add non-zero cols of P into the sorted linked list lnk */
1598       PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
1599     }
1600     /* off-diagonal portion: Ao[i,:]*P */
1601     if (ao) {
1602       ai  = ao->i;
1603       pi  = p_oth->i;
1604       nzi = ai[i + 1] - ai[i];
1605       aj  = PetscSafePointerPlusOffset(ao->j, ai[i]);
1606       for (j = 0; j < nzi; j++) {
1607         row  = aj[j];
1608         pnz  = pi[row + 1] - pi[row];
1609         Jptr = p_oth->j + pi[row];
1610         PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
1611       }
1612     }
1613     apnz       = lnk[0];
1614     api[i + 1] = api[i] + apnz;
1615     if (ap_rmax < apnz) ap_rmax = apnz;
1616 
1617     /* if free space is not available, double the total space in the list */
1618     if (current_space->local_remaining < apnz) {
1619       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
1620       nspacedouble++;
1621     }
1622 
1623     /* Copy data into free space, then initialize lnk */
1624     PetscCall(PetscLLCondensedClean(pN, apnz, current_space->array, lnk, lnkbt));
1625 
1626     current_space->array = PetscSafePointerPlusOffset(current_space->array, apnz);
1627     current_space->local_used += apnz;
1628     current_space->local_remaining -= apnz;
1629   }
1630   /* Allocate space for apj and apv, initialize apj, and */
1631   /* destroy list of free space and other temporary array(s) */
1632   PetscCall(PetscMalloc2(api[am], &apj, api[am], &apv));
1633   PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
1634   PetscCall(PetscLLDestroy(lnk, lnkbt));
1635 
1636   /* Create AP_loc for reuse */
1637   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
1638   PetscCall(MatSetType(ptap->AP_loc, ((PetscObject)p->A)->type_name));
1639 #if defined(PETSC_USE_INFO)
1640   if (ao) {
1641     apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
1642   } else {
1643     apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
1644   }
1645   ptap->AP_loc->info.mallocs           = nspacedouble;
1646   ptap->AP_loc->info.fill_ratio_given  = fill;
1647   ptap->AP_loc->info.fill_ratio_needed = apfill;
1648 
1649   if (api[am]) {
1650     PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
1651     PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n", (double)apfill));
1652   } else {
1653     PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc is empty \n"));
1654   }
1655 #endif
1656 
1657   /* (2-1) compute symbolic Co = Ro*AP_loc  */
1658   PetscCall(MatGetOptionsPrefix(A, &prefix));
1659   PetscCall(MatSetOptionsPrefix(ptap->Ro, prefix));
1660   PetscCall(MatAppendOptionsPrefix(ptap->Ro, "inner_offdiag_"));
1661   PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
1662   PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
1663   PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
1664   PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_C_oth_"));
1665   PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
1666   PetscCall(MatProductSetAlgorithm(ptap->C_oth, "default"));
1667   PetscCall(MatProductSetFill(ptap->C_oth, fill));
1668   PetscCall(MatProductSetFromOptions(ptap->C_oth));
1669   PetscCall(MatProductSymbolic(ptap->C_oth));
1670 
1671   /* (3) send coj of C_oth to other processors  */
1672   /* determine row ownership */
1673   PetscCall(PetscLayoutCreate(comm, &rowmap));
1674   rowmap->n  = pn;
1675   rowmap->bs = 1;
1676   PetscCall(PetscLayoutSetUp(rowmap));
1677   owners = rowmap->range;
1678 
1679   /* determine the number of messages to send, their lengths */
1680   PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
1681   PetscCall(PetscArrayzero(len_s, size));
1682   PetscCall(PetscArrayzero(len_si, size));
1683 
1684   c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
1685   coi   = c_oth->i;
1686   coj   = c_oth->j;
1687   con   = ptap->C_oth->rmap->n;
1688   proc  = 0;
1689   for (i = 0; i < con; i++) {
1690     while (prmap[i] >= owners[proc + 1]) proc++;
1691     len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
1692     len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1693   }
1694 
1695   len          = 0; /* max length of buf_si[], see (4) */
1696   owners_co[0] = 0;
1697   nsend        = 0;
1698   for (proc = 0; proc < size; proc++) {
1699     owners_co[proc + 1] = owners_co[proc] + len_si[proc];
1700     if (len_s[proc]) {
1701       nsend++;
1702       len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1703       len += len_si[proc];
1704     }
1705   }
1706 
1707   /* determine the number and length of messages to receive for coi and coj  */
1708   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
1709   PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));
1710 
1711   /* post the Irecv and Isend of coj */
1712   PetscCall(PetscCommGetNewTag(comm, &tagj));
1713   PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
1714   PetscCall(PetscMalloc1(nsend + 1, &swaits));
1715   for (proc = 0, k = 0; proc < size; proc++) {
1716     if (!len_s[proc]) continue;
1717     i = owners_co[proc];
1718     PetscCallMPI(MPIU_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
1719     k++;
1720   }
1721 
1722   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1723   PetscCall(MatSetOptionsPrefix(ptap->Rd, prefix));
1724   PetscCall(MatAppendOptionsPrefix(ptap->Rd, "inner_diag_"));
1725   PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
1726   PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
1727   PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
1728   PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_C_loc_"));
1729   PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
1730   PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
1731   PetscCall(MatProductSetFill(ptap->C_loc, fill));
1732   PetscCall(MatProductSetFromOptions(ptap->C_loc));
1733   PetscCall(MatProductSymbolic(ptap->C_loc));
1734 
1735   c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
1736 
1737   /* receives coj are complete */
1738   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
1739   PetscCall(PetscFree(rwaits));
1740   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
1741 
1742   /* add received column indices into ta to update Crmax */
1743   for (k = 0; k < nrecv; k++) { /* k-th received message */
1744     Jptr = buf_rj[k];
1745     for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1));
1746   }
1747   PetscCall(PetscHMapIGetSize(ta, &Crmax));
1748   PetscCall(PetscHMapIDestroy(&ta));
1749 
1750   /* (4) send and recv coi */
1751   PetscCall(PetscCommGetNewTag(comm, &tagi));
1752   PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
1753   PetscCall(PetscMalloc1(len + 1, &buf_s));
1754   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1755   for (proc = 0, k = 0; proc < size; proc++) {
1756     if (!len_s[proc]) continue;
1757     /* form outgoing message for i-structure:
1758          buf_si[0]:                 nrows to be sent
1759                [1:nrows]:           row index (global)
1760                [nrows+1:2*nrows+1]: i-structure index
1761     */
1762     nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
1763     buf_si_i    = buf_si + nrows + 1;
1764     buf_si[0]   = nrows;
1765     buf_si_i[0] = 0;
1766     nrows       = 0;
1767     for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
1768       nzi                 = coi[i + 1] - coi[i];
1769       buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
1770       buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
1771       nrows++;
1772     }
1773     PetscCallMPI(MPIU_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
1774     k++;
1775     buf_si += len_si[proc];
1776   }
1777   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
1778   PetscCall(PetscFree(rwaits));
1779   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
1780 
1781   PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
1782   PetscCall(PetscFree(len_ri));
1783   PetscCall(PetscFree(swaits));
1784   PetscCall(PetscFree(buf_s));
1785 
1786   /* (5) compute the local portion of Cmpi      */
1787   /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
1788   PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
1789   current_space = free_space;
1790 
1791   PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
1792   for (k = 0; k < nrecv; k++) {
1793     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1794     nrows       = *buf_ri_k[k];
1795     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
1796     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
1797   }
1798 
1799   MatPreallocateBegin(comm, pn, pn, dnz, onz);
1800   PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
1801   for (i = 0; i < pn; i++) {
1802     /* add C_loc into Cmpi */
1803     nzi  = c_loc->i[i + 1] - c_loc->i[i];
1804     Jptr = PetscSafePointerPlusOffset(c_loc->j, c_loc->i[i]);
1805     PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
1806 
1807     /* add received col data into lnk */
1808     for (k = 0; k < nrecv; k++) { /* k-th received message */
1809       if (i == *nextrow[k]) {     /* i-th row */
1810         nzi  = *(nextci[k] + 1) - *nextci[k];
1811         Jptr = buf_rj[k] + *nextci[k];
1812         PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
1813         nextrow[k]++;
1814         nextci[k]++;
1815       }
1816     }
1817     nzi = lnk[0];
1818 
1819     /* copy data into free space, then initialize lnk */
1820     PetscCall(PetscLLCondensedClean(pN, nzi, current_space->array, lnk, lnkbt));
1821     PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
1822   }
1823   PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
1824   PetscCall(PetscLLDestroy(lnk, lnkbt));
1825   PetscCall(PetscFreeSpaceDestroy(free_space));
1826 
1827   /* local sizes and preallocation */
1828   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1829   if (P->cmap->bs > 0) {
1830     PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
1831     PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
1832   }
1833   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1834   MatPreallocateEnd(dnz, onz);
1835 
1836   /* members in merge */
1837   PetscCall(PetscFree(id_r));
1838   PetscCall(PetscFree(len_r));
1839   PetscCall(PetscFree(buf_ri[0]));
1840   PetscCall(PetscFree(buf_ri));
1841   PetscCall(PetscFree(buf_rj[0]));
1842   PetscCall(PetscFree(buf_rj));
1843   PetscCall(PetscLayoutDestroy(&rowmap));
1844 
1845   PetscCall(PetscCalloc1(pN, &ptap->apa));
1846 
1847   /* attach the supporting struct to Cmpi for reuse */
1848   Cmpi->product->data    = ptap;
1849   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1850   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
1851 
1852   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1853   Cmpi->assembled = PETSC_FALSE;
1854   PetscFunctionReturn(PETSC_SUCCESS);
1855 }
1856 
1857 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A, Mat P, Mat C)
1858 {
1859   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
1860   Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)a->A->data, *ao = (Mat_SeqAIJ *)a->B->data;
1861   Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
1862   Mat_APMPI         *ptap;
1863   Mat                AP_loc, C_loc, C_oth;
1864   PetscInt           i, rstart, rend, cm, ncols, row;
1865   PetscInt          *api, *apj, am = A->rmap->n, j, col, apnz;
1866   PetscScalar       *apa;
1867   const PetscInt    *cols;
1868   const PetscScalar *vals;
1869 
1870   PetscFunctionBegin;
1871   MatCheckProduct(C, 3);
1872   ptap = (Mat_APMPI *)C->product->data;
1873   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
1874   PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
1875 
1876   PetscCall(MatZeroEntries(C));
1877   /* 1) get R = Pd^T,Ro = Po^T */
1878   if (ptap->reuse == MAT_REUSE_MATRIX) {
1879     PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
1880     PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
1881   }
1882 
1883   /* 2) get AP_loc */
1884   AP_loc = ptap->AP_loc;
1885   ap     = (Mat_SeqAIJ *)AP_loc->data;
1886 
1887   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
1888   if (ptap->reuse == MAT_REUSE_MATRIX) {
1889     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
1890     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
1891     PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
1892   }
1893 
1894   /* 2-2) compute numeric A_loc*P - dominating part */
1895   /* get data from symbolic products */
1896   p_loc = (Mat_SeqAIJ *)ptap->P_loc->data;
1897   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)ptap->P_oth->data;
1898   apa = ptap->apa;
1899   api = ap->i;
1900   apj = ap->j;
1901   for (i = 0; i < am; i++) {
1902     /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
1903     AProw_nonscalable(i, ad, ao, p_loc, p_oth, apa);
1904     apnz = api[i + 1] - api[i];
1905     for (j = 0; j < apnz; j++) {
1906       col                 = apj[j + api[i]];
1907       ap->a[j + ap->i[i]] = apa[col];
1908       apa[col]            = 0.0;
1909     }
1910   }
1911   /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
1912   PetscCall(PetscObjectStateIncrease((PetscObject)AP_loc));
1913 
1914   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
1915   PetscCall(MatProductNumeric(ptap->C_loc));
1916   PetscCall(MatProductNumeric(ptap->C_oth));
1917   C_loc = ptap->C_loc;
1918   C_oth = ptap->C_oth;
1919 
1920   /* add C_loc and Co to C */
1921   PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
1922 
1923   /* C_loc -> C */
1924   cm    = C_loc->rmap->N;
1925   c_seq = (Mat_SeqAIJ *)C_loc->data;
1926   cols  = c_seq->j;
1927   vals  = c_seq->a;
1928 
1929   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
1930   /* when there are no off-processor parts.  */
1931   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
1932   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
1933   /* a table, and other, more complex stuff has to be done. */
1934   if (C->assembled) {
1935     C->was_assembled = PETSC_TRUE;
1936     C->assembled     = PETSC_FALSE;
1937   }
1938   if (C->was_assembled) {
1939     for (i = 0; i < cm; i++) {
1940       ncols = c_seq->i[i + 1] - c_seq->i[i];
1941       row   = rstart + i;
1942       PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
1943       cols = PetscSafePointerPlusOffset(cols, ncols);
1944       vals += ncols;
1945     }
1946   } else {
1947     PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
1948   }
1949 
1950   /* Co -> C, off-processor part */
1951   cm    = C_oth->rmap->N;
1952   c_seq = (Mat_SeqAIJ *)C_oth->data;
1953   cols  = c_seq->j;
1954   vals  = c_seq->a;
1955   for (i = 0; i < cm; i++) {
1956     ncols = c_seq->i[i + 1] - c_seq->i[i];
1957     row   = p->garray[i];
1958     PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
1959     cols += ncols;
1960     vals += ncols;
1961   }
1962 
1963   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1964   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
1965 
1966   ptap->reuse = MAT_REUSE_MATRIX;
1967   PetscFunctionReturn(PETSC_SUCCESS);
1968 }
1969 
1970 PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
1971 {
1972   Mat_Product        *product = C->product;
1973   Mat                 A = product->A, P = product->B;
1974   MatProductAlgorithm alg  = product->alg;
1975   PetscReal           fill = product->fill;
1976   PetscBool           flg;
1977 
1978   PetscFunctionBegin;
1979   /* scalable: do R=P^T locally, then C=R*A*P */
1980   PetscCall(PetscStrcmp(alg, "scalable", &flg));
1981   if (flg) {
1982     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A, P, product->fill, C));
1983     C->ops->productnumeric = MatProductNumeric_PtAP;
1984     goto next;
1985   }
1986 
1987   /* nonscalable: do R=P^T locally, then C=R*A*P */
1988   PetscCall(PetscStrcmp(alg, "nonscalable", &flg));
1989   if (flg) {
1990     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ(A, P, fill, C));
1991     goto next;
1992   }
1993 
1994   /* allatonce */
1995   PetscCall(PetscStrcmp(alg, "allatonce", &flg));
1996   if (flg) {
1997     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A, P, fill, C));
1998     goto next;
1999   }
2000 
2001   /* allatonce_merged */
2002   PetscCall(PetscStrcmp(alg, "allatonce_merged", &flg));
2003   if (flg) {
2004     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A, P, fill, C));
2005     goto next;
2006   }
2007 
2008   /* backend general code */
2009   PetscCall(PetscStrcmp(alg, "backend", &flg));
2010   if (flg) {
2011     PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
2012     PetscFunctionReturn(PETSC_SUCCESS);
2013   }
2014 
2015   /* hypre */
2016 #if defined(PETSC_HAVE_HYPRE)
2017   PetscCall(PetscStrcmp(alg, "hypre", &flg));
2018   if (flg) {
2019     PetscCall(MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A, P, fill, C));
2020     C->ops->productnumeric = MatProductNumeric_PtAP;
2021     PetscFunctionReturn(PETSC_SUCCESS);
2022   }
2023 #endif
2024   SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
2025 
2026 next:
2027   C->ops->productnumeric = MatProductNumeric_PtAP;
2028   PetscFunctionReturn(PETSC_SUCCESS);
2029 }
2030