xref: /petsc/src/mat/impls/aij/mpi/mpiptap.c (revision 21e3ffae2f3b73c0bd738cf6d0a809700fc04bb0)
1 
2 /*
3   Defines projective product routines where A is a MPIAIJ matrix
4           C = P^T * A * P
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <../src/mat/impls/aij/mpi/mpiaij.h>
10 #include <petscbt.h>
11 #include <petsctime.h>
12 #include <petsc/private/hashmapiv.h>
13 #include <petsc/private/hashseti.h>
14 #include <petscsf.h>
15 
16 PetscErrorCode MatView_MPIAIJ_PtAP(Mat A, PetscViewer viewer)
17 {
18   PetscBool         iascii;
19   PetscViewerFormat format;
20   Mat_APMPI        *ptap;
21 
22   PetscFunctionBegin;
23   MatCheckProduct(A, 1);
24   ptap = (Mat_APMPI *)A->product->data;
25   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
26   if (iascii) {
27     PetscCall(PetscViewerGetFormat(viewer, &format));
28     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
29       if (ptap->algType == 0) {
30         PetscCall(PetscViewerASCIIPrintf(viewer, "using scalable MatPtAP() implementation\n"));
31       } else if (ptap->algType == 1) {
32         PetscCall(PetscViewerASCIIPrintf(viewer, "using nonscalable MatPtAP() implementation\n"));
33       } else if (ptap->algType == 2) {
34         PetscCall(PetscViewerASCIIPrintf(viewer, "using allatonce MatPtAP() implementation\n"));
35       } else if (ptap->algType == 3) {
36         PetscCall(PetscViewerASCIIPrintf(viewer, "using merged allatonce MatPtAP() implementation\n"));
37       }
38     }
39   }
40   PetscFunctionReturn(PETSC_SUCCESS);
41 }
42 
43 PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
44 {
45   Mat_APMPI           *ptap = (Mat_APMPI *)data;
46   Mat_Merge_SeqsToMPI *merge;
47 
48   PetscFunctionBegin;
49   PetscCall(PetscFree2(ptap->startsj_s, ptap->startsj_r));
50   PetscCall(PetscFree(ptap->bufa));
51   PetscCall(MatDestroy(&ptap->P_loc));
52   PetscCall(MatDestroy(&ptap->P_oth));
53   PetscCall(MatDestroy(&ptap->A_loc)); /* used by MatTransposeMatMult() */
54   PetscCall(MatDestroy(&ptap->Rd));
55   PetscCall(MatDestroy(&ptap->Ro));
56   if (ptap->AP_loc) { /* used by alg_rap */
57     Mat_SeqAIJ *ap = (Mat_SeqAIJ *)(ptap->AP_loc)->data;
58     PetscCall(PetscFree(ap->i));
59     PetscCall(PetscFree2(ap->j, ap->a));
60     PetscCall(MatDestroy(&ptap->AP_loc));
61   } else { /* used by alg_ptap */
62     PetscCall(PetscFree(ptap->api));
63     PetscCall(PetscFree(ptap->apj));
64   }
65   PetscCall(MatDestroy(&ptap->C_loc));
66   PetscCall(MatDestroy(&ptap->C_oth));
67   if (ptap->apa) PetscCall(PetscFree(ptap->apa));
68 
69   PetscCall(MatDestroy(&ptap->Pt));
70 
71   merge = ptap->merge;
72   if (merge) { /* used by alg_ptap */
73     PetscCall(PetscFree(merge->id_r));
74     PetscCall(PetscFree(merge->len_s));
75     PetscCall(PetscFree(merge->len_r));
76     PetscCall(PetscFree(merge->bi));
77     PetscCall(PetscFree(merge->bj));
78     PetscCall(PetscFree(merge->buf_ri[0]));
79     PetscCall(PetscFree(merge->buf_ri));
80     PetscCall(PetscFree(merge->buf_rj[0]));
81     PetscCall(PetscFree(merge->buf_rj));
82     PetscCall(PetscFree(merge->coi));
83     PetscCall(PetscFree(merge->coj));
84     PetscCall(PetscFree(merge->owners_co));
85     PetscCall(PetscLayoutDestroy(&merge->rowmap));
86     PetscCall(PetscFree(ptap->merge));
87   }
88   PetscCall(ISLocalToGlobalMappingDestroy(&ptap->ltog));
89 
90   PetscCall(PetscSFDestroy(&ptap->sf));
91   PetscCall(PetscFree(ptap->c_othi));
92   PetscCall(PetscFree(ptap->c_rmti));
93   PetscCall(PetscFree(ptap));
94   PetscFunctionReturn(PETSC_SUCCESS);
95 }
96 
97 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, Mat C)
98 {
99   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
100   Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
101   Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
102   Mat_APMPI         *ptap;
103   Mat                AP_loc, C_loc, C_oth;
104   PetscInt           i, rstart, rend, cm, ncols, row, *api, *apj, am = A->rmap->n, apnz, nout;
105   PetscScalar       *apa;
106   const PetscInt    *cols;
107   const PetscScalar *vals;
108 
109   PetscFunctionBegin;
110   MatCheckProduct(C, 3);
111   ptap = (Mat_APMPI *)C->product->data;
112   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
113   PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
114 
115   PetscCall(MatZeroEntries(C));
116 
117   /* 1) get R = Pd^T,Ro = Po^T */
118   if (ptap->reuse == MAT_REUSE_MATRIX) {
119     PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
120     PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
121   }
122 
123   /* 2) get AP_loc */
124   AP_loc = ptap->AP_loc;
125   ap     = (Mat_SeqAIJ *)AP_loc->data;
126 
127   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
128   /*-----------------------------------------------------*/
129   if (ptap->reuse == MAT_REUSE_MATRIX) {
130     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
131     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
132     PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
133   }
134 
135   /* 2-2) compute numeric A_loc*P - dominating part */
136   /* ---------------------------------------------- */
137   /* get data from symbolic products */
138   p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
139   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
140 
141   api = ap->i;
142   apj = ap->j;
143   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, api[AP_loc->rmap->n], apj, apj));
144   for (i = 0; i < am; i++) {
145     /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
146     apnz = api[i + 1] - api[i];
147     apa  = ap->a + api[i];
148     PetscCall(PetscArrayzero(apa, apnz));
149     AProw_scalable(i, ad, ao, p_loc, p_oth, api, apj, apa);
150   }
151   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, api[AP_loc->rmap->n], apj, &nout, apj));
152   PetscCheck(api[AP_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, api[AP_loc->rmap->n], nout);
153 
154   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
155   /* Always use scalable version since we are in the MPI scalable version */
156   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd, AP_loc, ptap->C_loc));
157   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro, AP_loc, ptap->C_oth));
158 
159   C_loc = ptap->C_loc;
160   C_oth = ptap->C_oth;
161 
162   /* add C_loc and Co to to C */
163   PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
164 
165   /* C_loc -> C */
166   cm    = C_loc->rmap->N;
167   c_seq = (Mat_SeqAIJ *)C_loc->data;
168   cols  = c_seq->j;
169   vals  = c_seq->a;
170   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_loc->rmap->n], c_seq->j, c_seq->j));
171 
172   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
173   /* when there are no off-processor parts.  */
174   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
175   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
176   /* a table, and other, more complex stuff has to be done. */
177   if (C->assembled) {
178     C->was_assembled = PETSC_TRUE;
179     C->assembled     = PETSC_FALSE;
180   }
181   if (C->was_assembled) {
182     for (i = 0; i < cm; i++) {
183       ncols = c_seq->i[i + 1] - c_seq->i[i];
184       row   = rstart + i;
185       PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
186       cols += ncols;
187       vals += ncols;
188     }
189   } else {
190     PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
191   }
192   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_loc->rmap->n], c_seq->j, &nout, c_seq->j));
193   PetscCheck(c_seq->i[C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout);
194 
195   /* Co -> C, off-processor part */
196   cm    = C_oth->rmap->N;
197   c_seq = (Mat_SeqAIJ *)C_oth->data;
198   cols  = c_seq->j;
199   vals  = c_seq->a;
200   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_oth->rmap->n], c_seq->j, c_seq->j));
201   for (i = 0; i < cm; i++) {
202     ncols = c_seq->i[i + 1] - c_seq->i[i];
203     row   = p->garray[i];
204     PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
205     cols += ncols;
206     vals += ncols;
207   }
208   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
209   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
210 
211   ptap->reuse = MAT_REUSE_MATRIX;
212 
213   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_oth->rmap->n], c_seq->j, &nout, c_seq->j));
214   PetscCheck(c_seq->i[C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout);
215   PetscFunctionReturn(PETSC_SUCCESS);
216 }
217 
/*
  MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable - symbolic phase of C = P^T*A*P using
  the memory-scalable algorithm (algType 0).

  Builds and attaches to Cmpi->product->data a Mat_APMPI structure holding:
    - P_loc/P_oth: local rows of P and remote rows of P needed by A's off-diagonal
    - Rd/Ro: transposes of P's diagonal/off-diagonal blocks
    - AP_loc: symbolic A_loc*P (column indices compacted via ltog)
    - C_loc/C_oth: symbolic Rd*AP_loc and Ro*AP_loc
  and preallocates Cmpi. Rows of C_oth belong to other processes, so their
  i/j structure is shipped with nonblocking MPI and merged into the local
  preallocation counts. Numeric assembly happens later in
  MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable().
*/
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, PetscReal fill, Mat Cmpi)
{
  Mat_APMPI               *ptap;
  Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  MPI_Comm                 comm;
  PetscMPIInt              size, rank;
  Mat                      P_loc, P_oth;
  PetscFreeSpaceList       free_space = NULL, current_space = NULL;
  PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
  PetscInt                *lnk, i, k, pnz, row, nsend;
  PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
  PETSC_UNUSED PetscMPIInt icompleted = 0;
  PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
  const PetscInt          *owners;
  PetscInt                 len, proc, *dnz, *onz, nzi, nspacedouble;
  PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
  MPI_Request             *swaits, *rwaits;
  MPI_Status              *sstatus, rstatus;
  PetscLayout              rowmap;
  PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
  PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
  PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, Crmax, *aj, *ai, *pi, nout;
  Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
  PetscScalar             *apv;
  PetscHMapI               ta;
  MatType                  mtype;
  const char              *prefix;
#if defined(PETSC_USE_INFO)
  PetscReal apfill;
#endif

  PetscFunctionBegin;
  MatCheckProduct(Cmpi, 4);
  PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));

  /* on a single process A has no off-diagonal block; ao stays NULL */
  if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;

  /* create symbolic parallel matrix Cmpi */
  PetscCall(MatGetType(A, &mtype));
  PetscCall(MatSetType(Cmpi, mtype));

  /* create struct Mat_APMPI and attached it to C later */
  PetscCall(PetscNew(&ptap));
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 0;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &P_oth));
  /* get P_loc by taking all local rows of P */
  PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &P_loc));

  ptap->P_loc = P_loc;
  ptap->P_oth = P_oth;

  /* (0) compute Rd = Pd^T, Ro = Po^T  */
  /* --------------------------------- */
  PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
  PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  /* ----------------------------------------------------------------- */
  p_loc = (Mat_SeqAIJ *)P_loc->data;
  if (P_oth) p_oth = (Mat_SeqAIJ *)P_oth->data;

  /* create and initialize a linked list */
  PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* for compute AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc, P_loc->rmap->N, ta);
  MatRowMergeMax_SeqAIJ(p_oth, P_oth->rmap->N, ta);
  PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */

  PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
  } else {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
  }
  current_space = free_space;
  nspacedouble  = 0;

  PetscCall(PetscMalloc1(am + 1, &api));
  api[0] = 0;
  /* symbolic row merge: union the column patterns of the P rows selected by each A row */
  for (i = 0; i < am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai  = ad->i;
    pi  = p_loc->i;
    nzi = ai[i + 1] - ai[i];
    aj  = ad->j + ai[i];
    for (j = 0; j < nzi; j++) {
      row  = aj[j];
      pnz  = pi[row + 1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai  = ao->i;
      pi  = p_oth->i;
      nzi = ai[i + 1] - ai[i];
      aj  = ao->j + ai[i];
      for (j = 0; j < nzi; j++) {
        row  = aj[j];
        pnz  = pi[row + 1] - pi[row];
        Jptr = p_oth->j + pi[row];
        PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
      }
    }
    apnz       = lnk[0];
    api[i + 1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining < apnz) {
      PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean_Scalable(apnz, current_space->array, lnk));

    current_space->array += apnz;
    current_space->local_used += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  PetscCall(PetscCalloc2(api[am], &apj, api[am], &apv));
  PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
  PetscCall(PetscLLCondensedDestroy_Scalable(lnk));

  /* Create AP_loc for reuse */
  PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
  /* compact out empty columns of AP_loc; ltog records the compacted->global column map */
  PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog));

#if defined(PETSC_USE_INFO)
  if (ao) {
    apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
  } else {
    apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
    PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n", (double)apfill));
  } else {
    PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc is empty \n"));
  }
#endif

  /* (2-1) compute symbolic Co = Ro*AP_loc  */
  /* -------------------------------------- */
  PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
  PetscCall(MatGetOptionsPrefix(A, &prefix));
  PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_offdiag_"));

  PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
  PetscCall(MatProductSetAlgorithm(ptap->C_oth, "sorted"));
  PetscCall(MatProductSetFill(ptap->C_oth, fill));
  PetscCall(MatProductSetFromOptions(ptap->C_oth));
  PetscCall(MatProductSymbolic(ptap->C_oth));

  /* (3) send coj of C_oth to other processors  */
  /* ------------------------------------------ */
  /* determine row ownership */
  PetscCall(PetscLayoutCreate(comm, &rowmap));
  PetscCall(PetscLayoutSetLocalSize(rowmap, pn));
  PetscCall(PetscLayoutSetBlockSize(rowmap, 1));
  PetscCall(PetscLayoutSetUp(rowmap));
  PetscCall(PetscLayoutGetRanges(rowmap, &owners));

  /* determine the number of messages to send, their lengths */
  PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
  PetscCall(PetscArrayzero(len_s, size));
  PetscCall(PetscArrayzero(len_si, size));

  c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
  coi   = c_oth->i;
  coj   = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  /* expand C_oth's compacted column indices to global before shipping them */
  PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, coi[con], coj, coj));
  for (i = 0; i < con; i++) {
    /* rows of C_oth are ordered by owning process (prmap = p->garray is sorted) */
    while (prmap[i] >= owners[proc + 1]) proc++;
    len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc = 0; proc < size; proc++) {
    owners_co[proc + 1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj  */
  PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
  PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));

  /* post the Irecv and Isend of coj */
  PetscCall(PetscCommGetNewTag(comm, &tagj));
  PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
  PetscCall(PetscMalloc1(nsend + 1, &swaits));
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    i = owners_co[proc];
    PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc, overlapped with the coj communication */
  /* ---------------------------------------- */
  PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
  PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
  PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
  PetscCall(MatProductSetFill(ptap->C_loc, fill));

  PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_diag_"));

  PetscCall(MatProductSetFromOptions(ptap->C_loc));
  PetscCall(MatProductSymbolic(ptap->C_loc));

  c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
  PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, c_loc->j));

  /* receives coj are complete */
  for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

  /* add received column indices into ta to update Crmax */
  for (k = 0; k < nrecv; k++) { /* k-th received message */
    Jptr = buf_rj[k];
    for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1));
  }
  PetscCall(PetscHMapIGetSize(ta, &Crmax));
  PetscCall(PetscHMapIDestroy(&ta));

  /* (4) send and recv coi */
  /*-----------------------*/
  PetscCall(PetscCommGetNewTag(comm, &tagi));
  PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
  PetscCall(PetscMalloc1(len + 1, &buf_s));
  buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows + 1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
      nzi                 = coi[i + 1] - coi[i];
      buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
      buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
    k++;
    buf_si += len_si[proc];
  }
  for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

  PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
  PetscCall(PetscFree(len_ri));
  PetscCall(PetscFree(swaits));
  PetscCall(PetscFree(buf_s));

  /* (5) compute the local portion of Cmpi      */
  /* ------------------------------------------ */
  /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */
  PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
  current_space = free_space;

  PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
  for (k = 0; k < nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
  }

  MatPreallocateBegin(comm, pn, pn, dnz, onz);
  PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
  for (i = 0; i < pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i + 1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));

    /* add received col data into lnk */
    for (k = 0; k < nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) {     /* i-th row */
        nzi  = *(nextci[k] + 1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
        nextrow[k]++;
        nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean_Scalable(nzi, current_space->array, lnk));
    PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
  }
  PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
  PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
  PetscCall(PetscFreeSpaceDestroy(free_space));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
  if (P->cmap->bs > 0) {
    PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
    PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
  }
  PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
  MatPreallocateEnd(dnz, onz);

  /* members in merge */
  PetscCall(PetscFree(id_r));
  PetscCall(PetscFree(len_r));
  PetscCall(PetscFree(buf_ri[0]));
  PetscCall(PetscFree(buf_ri));
  PetscCall(PetscFree(buf_rj[0]));
  PetscCall(PetscFree(buf_rj));
  PetscCall(PetscLayoutDestroy(&rowmap));

  /* map C_oth/C_loc column indices back to the compacted (local) numbering for the numeric phase */
  nout = 0;
  PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_oth->i[ptap->C_oth->rmap->n], c_oth->j, &nout, c_oth->j));
  PetscCheck(c_oth->i[ptap->C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_oth->i[ptap->C_oth->rmap->n], nout);
  PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, &nout, c_loc->j));
  PetscCheck(c_loc->i[ptap->C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_loc->i[ptap->C_loc->rmap->n], nout);

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled        = PETSC_FALSE;
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
  PetscFunctionReturn(PETSC_SUCCESS);
}
583 
/*
  MatPtAPSymbolicComputeOneRowOfAP_private - symbolic computation of row i of
  AP = A*P for the all-at-once PtAP algorithm: column indices of AP[i,:] are
  accumulated into two hash sets, dht for columns in P's owned column range
  [pcstart,pcend) and oht for off-process columns.

  Rows/columns appear dof-interleaved: an interleaved index r corresponds to
  scalar row r/dof with component offset r%dof, and column c of P expands to
  c*dof+offset (assumption inferred from the arithmetic below; TODO confirm
  against the allatonce symbolic driver).

  map[] translates A's off-diagonal (compacted) column index to the matching
  row of P_oth.
*/
static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHSetI dht, PetscHSetI oht)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
  PetscInt   *ai, nzi, j, *aj, row, col, *pi, *pj, pnz, nzpi, *p_othcols, k;
  PetscInt    pcstart, pcend, column, offset;

  PetscFunctionBegin;
  /* owned column range of P, scaled to the dof-interleaved numbering */
  pcstart = P->cmap->rstart;
  pcstart *= dof;
  pcend = P->cmap->rend;
  pcend *= dof;
  /* diagonal portion: Ad[i,:]*P */
  ai  = ad->i;
  nzi = ai[i + 1] - ai[i];
  aj  = ad->j + ai[i];
  for (j = 0; j < nzi; j++) {
    row    = aj[j];
    offset = row % dof;
    row /= dof;
    nzpi = pd->i[row + 1] - pd->i[row];
    pj   = pd->j + pd->i[row];
    /* Pd columns are local; shift by pcstart to make them global -> diagonal set */
    for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(dht, pj[k] * dof + offset + pcstart));
  }
  /* off diag P */
  for (j = 0; j < nzi; j++) {
    row    = aj[j];
    offset = row % dof;
    row /= dof;
    nzpi = po->i[row + 1] - po->i[row];
    pj   = po->j + po->i[row];
    /* Po columns map to global via p->garray -> off-diagonal set */
    for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(oht, p->garray[pj[k]] * dof + offset));
  }

  /* off diagonal part: Ao[i, :]*P_oth */
  /* NOTE(review): ao is assigned unconditionally above, so this guard is always
     true here (unlike the scalable path where ao may be NULL on one process) */
  if (ao) {
    ai  = ao->i;
    pi  = p_oth->i;
    nzi = ai[i + 1] - ai[i];
    aj  = ao->j + ai[i];
    for (j = 0; j < nzi; j++) {
      row       = aj[j];
      offset    = a->garray[row] % dof;
      row       = map[row]; /* translate Ao column to the matching P_oth row */
      pnz       = pi[row + 1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      /* P_oth columns are already global; classify each against the owned range */
      for (col = 0; col < pnz; col++) {
        column = p_othcols[col] * dof + offset;
        if (column >= pcstart && column < pcend) {
          PetscCall(PetscHSetIAdd(dht, column));
        } else {
          PetscCall(PetscHSetIAdd(oht, column));
        }
      }
    }
  } /* end if (ao) */
  PetscFunctionReturn(PETSC_SUCCESS);
}
642 
/*
  MatPtAPNumericComputeOneRowOfAP_private - numeric counterpart of the symbolic
  one-row routine: accumulates the values of row i of AP = A*P into the hash
  map hmap (global column index -> accumulated value) by summing
  A[i,k]*P[k,:] over the diagonal block, local off-diagonal block, and the
  gathered remote rows P_oth.

  Uses the same dof-interleaved index convention as the symbolic routine
  (interleaved index r -> scalar row r/dof, component r%dof; assumption
  inferred from the arithmetic — TODO confirm). map[] translates Ao column
  indices to P_oth rows. Flops are logged per contribution (2 flops per
  multiply-add).
*/
static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHMapIV hmap)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
  PetscInt   *ai, nzi, j, *aj, row, col, *pi, pnz, *p_othcols, pcstart, *pj, k, nzpi, offset;
  PetscScalar ra, *aa, *pa;

  PetscFunctionBegin;
  /* start of P's owned column range in the dof-interleaved numbering */
  pcstart = P->cmap->rstart;
  pcstart *= dof;

  /* diagonal portion: Ad[i,:]*P */
  ai  = ad->i;
  nzi = ai[i + 1] - ai[i];
  aj  = ad->j + ai[i];
  aa  = ad->a + ai[i];
  for (j = 0; j < nzi; j++) {
    ra     = aa[j];
    row    = aj[j];
    offset = row % dof;
    row /= dof;
    nzpi = pd->i[row + 1] - pd->i[row];
    pj   = pd->j + pd->i[row];
    pa   = pd->a + pd->i[row];
    /* Pd columns are local; shift by pcstart to key the hash map by global column */
    for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, pj[k] * dof + offset + pcstart, ra * pa[k]));
    PetscCall(PetscLogFlops(2.0 * nzpi));
  }
  /* local off-diagonal portion: Ad[i,:]*Po (columns globalized via p->garray) */
  for (j = 0; j < nzi; j++) {
    ra     = aa[j];
    row    = aj[j];
    offset = row % dof;
    row /= dof;
    nzpi = po->i[row + 1] - po->i[row];
    pj   = po->j + po->i[row];
    pa   = po->a + po->i[row];
    for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, p->garray[pj[k]] * dof + offset, ra * pa[k]));
    PetscCall(PetscLogFlops(2.0 * nzpi));
  }

  /* off diagonal part: Ao[i, :]*P_oth */
  /* NOTE(review): ao is assigned unconditionally above, so this guard is always true here */
  if (ao) {
    ai  = ao->i;
    pi  = p_oth->i;
    nzi = ai[i + 1] - ai[i];
    aj  = ao->j + ai[i];
    aa  = ao->a + ai[i];
    for (j = 0; j < nzi; j++) {
      row       = aj[j];
      offset    = a->garray[row] % dof;
      row       = map[row]; /* translate Ao column to the matching P_oth row */
      ra        = aa[j];
      pnz       = pi[row + 1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      pa        = p_oth->a + pi[row];
      /* P_oth columns are already global */
      for (col = 0; col < pnz; col++) PetscCall(PetscHMapIVAddValue(hmap, p_othcols[col] * dof + offset, ra * pa[col]));
      PetscCall(PetscLogFlops(2.0 * pnz));
    }
  } /* end if (ao) */

  PetscFunctionReturn(PETSC_SUCCESS);
}
704 
705 PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat, Mat, PetscInt dof, MatReuse, Mat *);
706 
707 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, Mat C)
708 {
709   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
710   Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
711   Mat_APMPI      *ptap;
712   PetscHMapIV     hmap;
713   PetscInt        i, j, jj, kk, nzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, ccstart, ccend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, *dcc, *occ, loc;
714   PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
715   PetscInt        offset, ii, pocol;
716   const PetscInt *mappingindices;
717   IS              map;
718 
719   PetscFunctionBegin;
720   MatCheckProduct(C, 4);
721   ptap = (Mat_APMPI *)C->product->data;
722   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
723   PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
724 
725   PetscCall(MatZeroEntries(C));
726 
727   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
728   /*-----------------------------------------------------*/
729   if (ptap->reuse == MAT_REUSE_MATRIX) {
730     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
731     PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
732   }
733   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
734 
735   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
736   pon *= dof;
737   PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
738   PetscCall(MatGetLocalSize(A, &am, NULL));
739   cmaxr = 0;
740   for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
741   PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
742   PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
743   PetscCall(ISGetIndices(map, &mappingindices));
744   for (i = 0; i < am && pon; i++) {
745     PetscCall(PetscHMapIVClear(hmap));
746     offset = i % dof;
747     ii     = i / dof;
748     nzi    = po->i[ii + 1] - po->i[ii];
749     if (!nzi) continue;
750     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
751     voff = 0;
752     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
753     if (!voff) continue;
754 
755     /* Form C(ii, :) */
756     poj = po->j + po->i[ii];
757     poa = po->a + po->i[ii];
758     for (j = 0; j < nzi; j++) {
759       pocol   = poj[j] * dof + offset;
760       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
761       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
762       for (jj = 0; jj < voff; jj++) {
763         apvaluestmp[jj] = apvalues[jj] * poa[j];
764         /* If the row is empty */
765         if (!c_rmtc[pocol]) {
766           c_rmtjj[jj] = apindices[jj];
767           c_rmtaa[jj] = apvaluestmp[jj];
768           c_rmtc[pocol]++;
769         } else {
770           PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
771           if (loc >= 0) { /* hit */
772             c_rmtaa[loc] += apvaluestmp[jj];
773             PetscCall(PetscLogFlops(1.0));
774           } else { /* new element */
775             loc = -(loc + 1);
776             /* Move data backward */
777             for (kk = c_rmtc[pocol]; kk > loc; kk--) {
778               c_rmtjj[kk] = c_rmtjj[kk - 1];
779               c_rmtaa[kk] = c_rmtaa[kk - 1];
780             } /* End kk */
781             c_rmtjj[loc] = apindices[jj];
782             c_rmtaa[loc] = apvaluestmp[jj];
783             c_rmtc[pocol]++;
784           }
785         }
786         PetscCall(PetscLogFlops(voff));
787       } /* End jj */
788     }   /* End j */
789   }     /* End i */
790 
791   PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
792   PetscCall(PetscHMapIVDestroy(&hmap));
793 
794   PetscCall(MatGetLocalSize(P, NULL, &pn));
795   pn *= dof;
796   PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));
797 
798   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
799   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
800   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
801   pcstart = pcstart * dof;
802   pcend   = pcend * dof;
803   cd      = (Mat_SeqAIJ *)(c->A)->data;
804   co      = (Mat_SeqAIJ *)(c->B)->data;
805 
806   cmaxr = 0;
807   for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
808   PetscCall(PetscCalloc5(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pn, &dcc, pn, &occ));
809   PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
810   for (i = 0; i < am && pn; i++) {
811     PetscCall(PetscHMapIVClear(hmap));
812     offset = i % dof;
813     ii     = i / dof;
814     nzi    = pd->i[ii + 1] - pd->i[ii];
815     if (!nzi) continue;
816     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
817     voff = 0;
818     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
819     if (!voff) continue;
820     /* Form C(ii, :) */
821     pdj = pd->j + pd->i[ii];
822     pda = pd->a + pd->i[ii];
823     for (j = 0; j < nzi; j++) {
824       row = pcstart + pdj[j] * dof + offset;
825       for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
826       PetscCall(PetscLogFlops(voff));
827       PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
828     }
829   }
830   PetscCall(ISRestoreIndices(map, &mappingindices));
831   PetscCall(MatGetOwnershipRangeColumn(C, &ccstart, &ccend));
832   PetscCall(PetscFree5(apindices, apvalues, apvaluestmp, dcc, occ));
833   PetscCall(PetscHMapIVDestroy(&hmap));
834   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
835   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
836   PetscCall(PetscFree2(c_rmtj, c_rmta));
837 
838   /* Add contributions from remote */
839   for (i = 0; i < pn; i++) {
840     row = i + pcstart;
841     PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
842   }
843   PetscCall(PetscFree2(c_othj, c_otha));
844 
845   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
846   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
847 
848   ptap->reuse = MAT_REUSE_MATRIX;
849   PetscFunctionReturn(PETSC_SUCCESS);
850 }
851 
852 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, Mat C)
853 {
854   PetscFunctionBegin;
855 
856   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, C));
857   PetscFunctionReturn(PETSC_SUCCESS);
858 }
859 
/*
  MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged - numeric phase of
  C = P^T * A * P ("merged" all-at-once variant).

  Unlike the non-merged variant, a single sweep over the local rows of A forms
  each row of AP once and scatters it into BOTH the remote send buffers (from
  the off-diagonal block of P) and the locally owned rows of C (from the
  diagonal block).  The remote contributions are then reduced through ptap->sf
  and added with MatSetValues(..., ADD_VALUES).  Requires the symbolic data
  (Mat_APMPI) attached to C->product by the matching symbolic routine.
*/
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, Mat C)
{
  Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
  Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
  Mat_APMPI      *ptap;
  PetscHMapIV     hmap;
  PetscInt        i, j, jj, kk, nzi, dnzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, loc;
  PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
  PetscInt        offset, ii, pocol;
  const PetscInt *mappingindices;
  IS              map;

  PetscFunctionBegin;
  MatCheckProduct(C, 4);
  ptap = (Mat_APMPI *)C->product->data;
  PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
  PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));

  /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
  }
  PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
  /* pon = number of off-diagonal columns of P (times dof); pn = number of locally owned C rows */
  PetscCall(MatGetLocalSize(p->B, NULL, &pon));
  pon *= dof;
  PetscCall(MatGetLocalSize(P, NULL, &pn));
  pn *= dof;

  PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
  PetscCall(MatGetLocalSize(A, &am, NULL));
  PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
  pcstart *= dof;
  pcend *= dof;
  /* cmaxr bounds both remote-row length and preallocated local row length; sizes the AP scratch */
  cmaxr = 0;
  for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
  cd = (Mat_SeqAIJ *)(c->A)->data;
  co = (Mat_SeqAIJ *)(c->B)->data;
  for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
  PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
  PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
  PetscCall(ISGetIndices(map, &mappingindices));
  /* Single sweep: each AP row feeds both the remote buffers and the local rows of C */
  for (i = 0; i < am && (pon || pn); i++) {
    PetscCall(PetscHMapIVClear(hmap));
    offset = i % dof;
    ii     = i / dof;
    nzi    = po->i[ii + 1] - po->i[ii];
    dnzi   = pd->i[ii + 1] - pd->i[ii];
    if (!nzi && !dnzi) continue; /* row of P^T has no entries at all */
    PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
    voff = 0;
    PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
    if (!voff) continue;

    /* Form remote C(ii, :) */
    poj = po->j + po->i[ii];
    poa = po->a + po->i[ii];
    for (j = 0; j < nzi; j++) {
      pocol   = poj[j] * dof + offset;
      c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
      c_rmtaa = c_rmta + ptap->c_rmti[pocol];
      for (jj = 0; jj < voff; jj++) {
        apvaluestmp[jj] = apvalues[jj] * poa[j];
        /* If the row is empty */
        if (!c_rmtc[pocol]) {
          c_rmtjj[jj] = apindices[jj];
          c_rmtaa[jj] = apvaluestmp[jj];
          c_rmtc[pocol]++;
        } else {
          /* Keep the buffered row sorted so repeated columns are merged in place */
          PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
          if (loc >= 0) { /* hit */
            c_rmtaa[loc] += apvaluestmp[jj];
            PetscCall(PetscLogFlops(1.0));
          } else { /* new element */
            loc = -(loc + 1);
            /* Move data backward */
            for (kk = c_rmtc[pocol]; kk > loc; kk--) {
              c_rmtjj[kk] = c_rmtjj[kk - 1];
              c_rmtaa[kk] = c_rmtaa[kk - 1];
            } /* End kk */
            c_rmtjj[loc] = apindices[jj];
            c_rmtaa[loc] = apvaluestmp[jj];
            c_rmtc[pocol]++;
          }
        }
      } /* End jj */
      PetscCall(PetscLogFlops(voff));
    } /* End j */

    /* Form local C(ii, :) */
    pdj = pd->j + pd->i[ii];
    pda = pd->a + pd->i[ii];
    for (j = 0; j < dnzi; j++) {
      row = pcstart + pdj[j] * dof + offset;
      for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j]; /* scale AP row by P^T entry */
      PetscCall(PetscLogFlops(voff));
      PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
    } /* End j */
  }   /* End i */

  PetscCall(ISRestoreIndices(map, &mappingindices));
  PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
  PetscCall(PetscHMapIVDestroy(&hmap));
  PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));

  /* Ship the remote contributions to their owners */
  PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
  PetscCall(PetscFree2(c_rmtj, c_rmta));

  /* Add contributions from remote */
  for (i = 0; i < pn; i++) {
    row = i + pcstart;
    PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
  }
  PetscCall(PetscFree2(c_othj, c_otha));

  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(PETSC_SUCCESS);
}
987 
988 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, Mat C)
989 {
990   PetscFunctionBegin;
991 
992   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, C));
993   PetscFunctionReturn(PETSC_SUCCESS);
994 }
995 
996 /* TODO: move algorithm selection to MatProductSetFromOptions */
997 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
998 {
999   Mat_APMPI      *ptap;
1000   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
1001   MPI_Comm        comm;
1002   Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
1003   MatType         mtype;
1004   PetscSF         sf;
1005   PetscSFNode    *iremote;
1006   PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
1007   const PetscInt *rootdegrees;
1008   PetscHSetI      ht, oht, *hta, *hto;
1009   PetscInt        pn, pon, *c_rmtc, i, j, nzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
1010   PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
1011   PetscInt        nalg = 2, alg = 0, offset, ii;
1012   PetscMPIInt     owner;
1013   const PetscInt *mappingindices;
1014   PetscBool       flg;
1015   const char     *algTypes[2] = {"overlapping", "merged"};
1016   IS              map;
1017 
1018   PetscFunctionBegin;
1019   MatCheckProduct(Cmpi, 5);
1020   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
1021   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1022 
1023   /* Create symbolic parallel matrix Cmpi */
1024   PetscCall(MatGetLocalSize(P, NULL, &pn));
1025   pn *= dof;
1026   PetscCall(MatGetType(A, &mtype));
1027   PetscCall(MatSetType(Cmpi, mtype));
1028   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1029 
1030   PetscCall(PetscNew(&ptap));
1031   ptap->reuse   = MAT_INITIAL_MATRIX;
1032   ptap->algType = 2;
1033 
1034   /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1035   PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
1036   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
1037   /* This equals to the number of offdiag columns in P */
1038   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1039   pon *= dof;
1040   /* offsets */
1041   PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
1042   /* The number of columns we will send to remote ranks */
1043   PetscCall(PetscMalloc1(pon, &c_rmtc));
1044   PetscCall(PetscMalloc1(pon, &hta));
1045   for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
1046   PetscCall(MatGetLocalSize(A, &am, NULL));
1047   PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
1048   /* Create hash table to merge all columns for C(i, :) */
1049   PetscCall(PetscHSetICreate(&ht));
1050 
1051   PetscCall(ISGetIndices(map, &mappingindices));
1052   ptap->c_rmti[0] = 0;
1053   /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
1054   for (i = 0; i < am && pon; i++) {
1055     /* Form one row of AP */
1056     PetscCall(PetscHSetIClear(ht));
1057     offset = i % dof;
1058     ii     = i / dof;
1059     /* If the off diag is empty, we should not do any calculation */
1060     nzi = po->i[ii + 1] - po->i[ii];
1061     if (!nzi) continue;
1062 
1063     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, ht));
1064     PetscCall(PetscHSetIGetSize(ht, &htsize));
1065     /* If AP is empty, just continue */
1066     if (!htsize) continue;
1067     /* Form C(ii, :) */
1068     poj = po->j + po->i[ii];
1069     for (j = 0; j < nzi; j++) PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
1070   }
1071 
1072   for (i = 0; i < pon; i++) {
1073     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1074     ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
1075     c_rmtc[i]           = htsize;
1076   }
1077 
1078   PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));
1079 
1080   for (i = 0; i < pon; i++) {
1081     off = 0;
1082     PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
1083     PetscCall(PetscHSetIDestroy(&hta[i]));
1084   }
1085   PetscCall(PetscFree(hta));
1086 
1087   PetscCall(PetscMalloc1(pon, &iremote));
1088   for (i = 0; i < pon; i++) {
1089     owner  = 0;
1090     lidx   = 0;
1091     offset = i % dof;
1092     ii     = i / dof;
1093     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1094     iremote[i].index = lidx * dof + offset;
1095     iremote[i].rank  = owner;
1096   }
1097 
1098   PetscCall(PetscSFCreate(comm, &sf));
1099   PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1100   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1101   PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
1102   PetscCall(PetscSFSetFromOptions(sf));
1103   PetscCall(PetscSFSetUp(sf));
1104   /* How many neighbors have contributions to my rows? */
1105   PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
1106   PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
1107   rootspacesize = 0;
1108   for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
1109   PetscCall(PetscMalloc1(rootspacesize, &rootspace));
1110   PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
1111   /* Get information from leaves
1112    * Number of columns other people contribute to my rows
1113    * */
1114   PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
1115   PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
1116   PetscCall(PetscFree(c_rmtc));
1117   PetscCall(PetscCalloc1(pn + 1, &ptap->c_othi));
1118   /* The number of columns is received for each row */
1119   ptap->c_othi[0]     = 0;
1120   rootspacesize       = 0;
1121   rootspaceoffsets[0] = 0;
1122   for (i = 0; i < pn; i++) {
1123     rcvncols = 0;
1124     for (j = 0; j < rootdegrees[i]; j++) {
1125       rcvncols += rootspace[rootspacesize];
1126       rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1127       rootspacesize++;
1128     }
1129     ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
1130   }
1131   PetscCall(PetscFree(rootspace));
1132 
1133   PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
1134   PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1135   PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1136   PetscCall(PetscSFDestroy(&sf));
1137   PetscCall(PetscFree(rootspaceoffsets));
1138 
1139   PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
1140   nleaves = 0;
1141   for (i = 0; i < pon; i++) {
1142     owner = 0;
1143     ii    = i / dof;
1144     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
1145     sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
1146     for (j = 0; j < sendncols; j++) {
1147       iremote[nleaves].rank    = owner;
1148       iremote[nleaves++].index = c_rmtoffsets[i] + j;
1149     }
1150   }
1151   PetscCall(PetscFree(c_rmtoffsets));
1152   PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));
1153 
1154   PetscCall(PetscSFCreate(comm, &ptap->sf));
1155   PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1156   PetscCall(PetscSFSetFromOptions(ptap->sf));
1157   /* One to one map */
1158   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1159 
1160   PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
1161   PetscCall(PetscHSetICreate(&oht));
1162   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1163   pcstart *= dof;
1164   pcend *= dof;
1165   PetscCall(PetscMalloc2(pn, &hta, pn, &hto));
1166   for (i = 0; i < pn; i++) {
1167     PetscCall(PetscHSetICreate(&hta[i]));
1168     PetscCall(PetscHSetICreate(&hto[i]));
1169   }
1170   /* Work on local part */
1171   /* 4) Pass 1: Estimate memory for C_loc */
1172   for (i = 0; i < am && pn; i++) {
1173     PetscCall(PetscHSetIClear(ht));
1174     PetscCall(PetscHSetIClear(oht));
1175     offset = i % dof;
1176     ii     = i / dof;
1177     nzi    = pd->i[ii + 1] - pd->i[ii];
1178     if (!nzi) continue;
1179 
1180     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
1181     PetscCall(PetscHSetIGetSize(ht, &htsize));
1182     PetscCall(PetscHSetIGetSize(oht, &htosize));
1183     if (!(htsize + htosize)) continue;
1184     /* Form C(ii, :) */
1185     pdj = pd->j + pd->i[ii];
1186     for (j = 0; j < nzi; j++) {
1187       PetscCall(PetscHSetIUpdate(hta[pdj[j] * dof + offset], ht));
1188       PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
1189     }
1190   }
1191 
1192   PetscCall(ISRestoreIndices(map, &mappingindices));
1193 
1194   PetscCall(PetscHSetIDestroy(&ht));
1195   PetscCall(PetscHSetIDestroy(&oht));
1196 
1197   /* Get remote data */
1198   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1199   PetscCall(PetscFree(c_rmtj));
1200 
1201   for (i = 0; i < pn; i++) {
1202     nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
1203     rdj = c_othj + ptap->c_othi[i];
1204     for (j = 0; j < nzi; j++) {
1205       col = rdj[j];
1206       /* diag part */
1207       if (col >= pcstart && col < pcend) {
1208         PetscCall(PetscHSetIAdd(hta[i], col));
1209       } else { /* off diag */
1210         PetscCall(PetscHSetIAdd(hto[i], col));
1211       }
1212     }
1213     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1214     dnz[i] = htsize;
1215     PetscCall(PetscHSetIDestroy(&hta[i]));
1216     PetscCall(PetscHSetIGetSize(hto[i], &htsize));
1217     onz[i] = htsize;
1218     PetscCall(PetscHSetIDestroy(&hto[i]));
1219   }
1220 
1221   PetscCall(PetscFree2(hta, hto));
1222   PetscCall(PetscFree(c_othj));
1223 
1224   /* local sizes and preallocation */
1225   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1226   PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
1227   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1228   PetscCall(MatSetUp(Cmpi));
1229   PetscCall(PetscFree2(dnz, onz));
1230 
1231   /* attach the supporting struct to Cmpi for reuse */
1232   Cmpi->product->data    = ptap;
1233   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1234   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
1235 
1236   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1237   Cmpi->assembled = PETSC_FALSE;
1238   /* pick an algorithm */
1239   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
1240   alg = 0;
1241   PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1242   PetscOptionsEnd();
1243   switch (alg) {
1244   case 0:
1245     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1246     break;
1247   case 1:
1248     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1249     break;
1250   default:
1251     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, " Unsupported allatonce numerical algorithm ");
1252   }
1253   PetscFunctionReturn(PETSC_SUCCESS);
1254 }
1255 
1256 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, PetscReal fill, Mat C)
1257 {
1258   PetscFunctionBegin;
1259   PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, fill, C));
1260   PetscFunctionReturn(PETSC_SUCCESS);
1261 }
1262 
1263 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
1264 {
1265   Mat_APMPI      *ptap;
1266   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
1267   MPI_Comm        comm;
1268   Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
1269   MatType         mtype;
1270   PetscSF         sf;
1271   PetscSFNode    *iremote;
1272   PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
1273   const PetscInt *rootdegrees;
1274   PetscHSetI      ht, oht, *hta, *hto, *htd;
1275   PetscInt        pn, pon, *c_rmtc, i, j, nzi, dnzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
1276   PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
1277   PetscInt        nalg = 2, alg = 0, offset, ii;
1278   PetscMPIInt     owner;
1279   PetscBool       flg;
1280   const char     *algTypes[2] = {"merged", "overlapping"};
1281   const PetscInt *mappingindices;
1282   IS              map;
1283 
1284   PetscFunctionBegin;
1285   MatCheckProduct(Cmpi, 5);
1286   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
1287   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1288 
1289   /* Create symbolic parallel matrix Cmpi */
1290   PetscCall(MatGetLocalSize(P, NULL, &pn));
1291   pn *= dof;
1292   PetscCall(MatGetType(A, &mtype));
1293   PetscCall(MatSetType(Cmpi, mtype));
1294   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1295 
1296   PetscCall(PetscNew(&ptap));
1297   ptap->reuse   = MAT_INITIAL_MATRIX;
1298   ptap->algType = 3;
1299 
1300   /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1301   PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
1302   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
1303 
1304   /* This equals to the number of offdiag columns in P */
1305   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1306   pon *= dof;
1307   /* offsets */
1308   PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
1309   /* The number of columns we will send to remote ranks */
1310   PetscCall(PetscMalloc1(pon, &c_rmtc));
1311   PetscCall(PetscMalloc1(pon, &hta));
1312   for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
1313   PetscCall(MatGetLocalSize(A, &am, NULL));
1314   PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
1315   /* Create hash table to merge all columns for C(i, :) */
1316   PetscCall(PetscHSetICreate(&ht));
1317   PetscCall(PetscHSetICreate(&oht));
1318   PetscCall(PetscMalloc2(pn, &htd, pn, &hto));
1319   for (i = 0; i < pn; i++) {
1320     PetscCall(PetscHSetICreate(&htd[i]));
1321     PetscCall(PetscHSetICreate(&hto[i]));
1322   }
1323 
1324   PetscCall(ISGetIndices(map, &mappingindices));
1325   ptap->c_rmti[0] = 0;
1326   /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
1327   for (i = 0; i < am && (pon || pn); i++) {
1328     /* Form one row of AP */
1329     PetscCall(PetscHSetIClear(ht));
1330     PetscCall(PetscHSetIClear(oht));
1331     offset = i % dof;
1332     ii     = i / dof;
1333     /* If the off diag is empty, we should not do any calculation */
1334     nzi  = po->i[ii + 1] - po->i[ii];
1335     dnzi = pd->i[ii + 1] - pd->i[ii];
1336     if (!nzi && !dnzi) continue;
1337 
1338     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
1339     PetscCall(PetscHSetIGetSize(ht, &htsize));
1340     PetscCall(PetscHSetIGetSize(oht, &htosize));
1341     /* If AP is empty, just continue */
1342     if (!(htsize + htosize)) continue;
1343 
1344     /* Form remote C(ii, :) */
1345     poj = po->j + po->i[ii];
1346     for (j = 0; j < nzi; j++) {
1347       PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
1348       PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], oht));
1349     }
1350 
1351     /* Form local C(ii, :) */
1352     pdj = pd->j + pd->i[ii];
1353     for (j = 0; j < dnzi; j++) {
1354       PetscCall(PetscHSetIUpdate(htd[pdj[j] * dof + offset], ht));
1355       PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
1356     }
1357   }
1358 
1359   PetscCall(ISRestoreIndices(map, &mappingindices));
1360 
1361   PetscCall(PetscHSetIDestroy(&ht));
1362   PetscCall(PetscHSetIDestroy(&oht));
1363 
1364   for (i = 0; i < pon; i++) {
1365     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1366     ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
1367     c_rmtc[i]           = htsize;
1368   }
1369 
1370   PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));
1371 
1372   for (i = 0; i < pon; i++) {
1373     off = 0;
1374     PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
1375     PetscCall(PetscHSetIDestroy(&hta[i]));
1376   }
1377   PetscCall(PetscFree(hta));
1378 
1379   PetscCall(PetscMalloc1(pon, &iremote));
1380   for (i = 0; i < pon; i++) {
1381     owner  = 0;
1382     lidx   = 0;
1383     offset = i % dof;
1384     ii     = i / dof;
1385     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1386     iremote[i].index = lidx * dof + offset;
1387     iremote[i].rank  = owner;
1388   }
1389 
1390   PetscCall(PetscSFCreate(comm, &sf));
1391   PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1392   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1393   PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
1394   PetscCall(PetscSFSetFromOptions(sf));
1395   PetscCall(PetscSFSetUp(sf));
1396   /* How many neighbors have contributions to my rows? */
1397   PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
1398   PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
1399   rootspacesize = 0;
1400   for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
1401   PetscCall(PetscMalloc1(rootspacesize, &rootspace));
1402   PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
1403   /* Get information from leaves
1404    * Number of columns other people contribute to my rows
1405    * */
1406   PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
1407   PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
1408   PetscCall(PetscFree(c_rmtc));
1409   PetscCall(PetscMalloc1(pn + 1, &ptap->c_othi));
1410   /* The number of columns is received for each row */
1411   ptap->c_othi[0]     = 0;
1412   rootspacesize       = 0;
1413   rootspaceoffsets[0] = 0;
1414   for (i = 0; i < pn; i++) {
1415     rcvncols = 0;
1416     for (j = 0; j < rootdegrees[i]; j++) {
1417       rcvncols += rootspace[rootspacesize];
1418       rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1419       rootspacesize++;
1420     }
1421     ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
1422   }
1423   PetscCall(PetscFree(rootspace));
1424 
1425   PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
1426   PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1427   PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1428   PetscCall(PetscSFDestroy(&sf));
1429   PetscCall(PetscFree(rootspaceoffsets));
1430 
1431   PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
1432   nleaves = 0;
1433   for (i = 0; i < pon; i++) {
1434     owner = 0;
1435     ii    = i / dof;
1436     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
1437     sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
1438     for (j = 0; j < sendncols; j++) {
1439       iremote[nleaves].rank    = owner;
1440       iremote[nleaves++].index = c_rmtoffsets[i] + j;
1441     }
1442   }
1443   PetscCall(PetscFree(c_rmtoffsets));
1444   PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));
1445 
1446   PetscCall(PetscSFCreate(comm, &ptap->sf));
1447   PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1448   PetscCall(PetscSFSetFromOptions(ptap->sf));
1449   /* One to one map */
1450   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1451   /* Get remote data */
1452   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1453   PetscCall(PetscFree(c_rmtj));
1454   PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
1455   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1456   pcstart *= dof;
1457   pcend *= dof;
1458   for (i = 0; i < pn; i++) {
1459     nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
1460     rdj = c_othj + ptap->c_othi[i];
1461     for (j = 0; j < nzi; j++) {
1462       col = rdj[j];
1463       /* diag part */
1464       if (col >= pcstart && col < pcend) {
1465         PetscCall(PetscHSetIAdd(htd[i], col));
1466       } else { /* off diag */
1467         PetscCall(PetscHSetIAdd(hto[i], col));
1468       }
1469     }
1470     PetscCall(PetscHSetIGetSize(htd[i], &htsize));
1471     dnz[i] = htsize;
1472     PetscCall(PetscHSetIDestroy(&htd[i]));
1473     PetscCall(PetscHSetIGetSize(hto[i], &htsize));
1474     onz[i] = htsize;
1475     PetscCall(PetscHSetIDestroy(&hto[i]));
1476   }
1477 
1478   PetscCall(PetscFree2(htd, hto));
1479   PetscCall(PetscFree(c_othj));
1480 
1481   /* local sizes and preallocation */
1482   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1483   PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
1484   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1485   PetscCall(PetscFree2(dnz, onz));
1486 
1487   /* attach the supporting struct to Cmpi for reuse */
1488   Cmpi->product->data    = ptap;
1489   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1490   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
1491 
1492   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1493   Cmpi->assembled = PETSC_FALSE;
1494   /* pick an algorithm */
1495   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
1496   alg = 0;
1497   PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1498   PetscOptionsEnd();
1499   switch (alg) {
1500   case 0:
1501     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1502     break;
1503   case 1:
1504     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1505     break;
1506   default:
1507     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, " Unsupported allatonce numerical algorithm ");
1508   }
1509   PetscFunctionReturn(PETSC_SUCCESS);
1510 }
1511 
/*
   MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged - symbolic phase of C = P^T*A*P
   using the merged all-at-once algorithm for plain MPIAIJ matrices.

   Thin wrapper: forwards to the dof-parameterized MPIXAIJ implementation
   with dof = 1 (one degree of freedom per point).
*/
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, PetscReal fill, Mat C)
{
  PetscFunctionBegin;
  PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, fill, C));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1518 
/*
   MatPtAPSymbolic_MPIAIJ_MPIAIJ - "nonscalable" symbolic phase of C = P^T*A*P
   for MPIAIJ matrices; the result structure is placed in Cmpi.

   Outline (step numbers match the inline comments below):
     (0) Rd = Pd^T, Ro = Po^T (transposes of the diagonal/off-diagonal blocks of P);
     (1) symbolic AP_loc = A_loc*P = Ad*P_loc + Ao*P_oth, built row by row with a
         condensed sorted linked list (lnk);
     (2) symbolic C_oth = Ro*AP_loc and C_loc = Rd*AP_loc via the MatProduct API;
     (3)-(4) send the j- and i-structure of C_oth rows to their owning processes
         with nonblocking MPI, overlapping the C_loc symbolic product;
     (5) merge C_loc with the received rows to preallocate Cmpi.

   All supporting objects (AP_loc, Rd, Ro, C_loc, C_oth, send/recv buffers, and
   the dense work array apa of global length pN) are stashed in a Mat_APMPI
   attached to Cmpi->product so MatPtAPNumeric_MPIAIJ_MPIAIJ() can reuse them.
   The algorithm is "nonscalable" (ptap->algType = 1) because apa is sized by
   the global number of columns of P.  Cmpi is left unassembled; assembly is
   performed by the numeric phase.
*/
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A, Mat P, PetscReal fill, Mat Cmpi)
{
  Mat_APMPI               *ptap;
  Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  MPI_Comm                 comm;
  PetscMPIInt              size, rank;
  PetscFreeSpaceList       free_space = NULL, current_space = NULL;
  PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
  PetscInt                *lnk, i, k, pnz, row, nsend;
  PetscBT                  lnkbt;
  PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
  PETSC_UNUSED PetscMPIInt icompleted = 0; /* only needed as MPI_Waitany output slot */
  PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
  PetscInt                 len, proc, *dnz, *onz, *owners, nzi, nspacedouble;
  PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
  MPI_Request             *swaits, *rwaits;
  MPI_Status              *sstatus, rstatus;
  PetscLayout              rowmap;
  PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
  PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
  PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, ap_rmax = 0, Crmax, *aj, *ai, *pi;
  Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
  PetscScalar             *apv;
  PetscHMapI               ta; /* hash map collecting distinct column indices; used to bound Crmax */
  MatType                  mtype;
  const char              *prefix;
#if defined(PETSC_USE_INFO)
  PetscReal apfill;
#endif

  PetscFunctionBegin;
  MatCheckProduct(Cmpi, 4);
  PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));

  /* off-diagonal block Ao exists only when running on more than one process */
  if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;

  /* create symbolic parallel matrix Cmpi */
  PetscCall(MatGetType(A, &mtype));
  PetscCall(MatSetType(Cmpi, mtype));

  /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;

  /* create struct Mat_APMPI and attached it to C later */
  PetscCall(PetscNew(&ptap));
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 1; /* algType 1 = "nonscalable", reported by MatView_MPIAIJ_PtAP() */

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
  /* get P_loc by taking all local rows of P */
  PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc));

  /* (0) compute Rd = Pd^T, Ro = Po^T  */
  /* --------------------------------- */
  PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
  PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  /* ----------------------------------------------------------------- */
  p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;

  /* create and initialize a linked list */
  PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* for compute AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc, ptap->P_loc->rmap->N, ta);
  /* NOTE(review): relies on MatRowMergeMax_SeqAIJ not evaluating its size argument
     when p_oth is NULL (serial case), since ptap->P_oth may be NULL here -- confirm */
  MatRowMergeMax_SeqAIJ(p_oth, ptap->P_oth->rmap->N, ta);
  PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
  /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */

  PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
  } else {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
  }
  current_space = free_space;
  nspacedouble  = 0;

  PetscCall(PetscMalloc1(am + 1, &api));
  api[0] = 0;
  for (i = 0; i < am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai  = ad->i;
    pi  = p_loc->i;
    nzi = ai[i + 1] - ai[i];
    aj  = ad->j + ai[i];
    for (j = 0; j < nzi; j++) {
      row  = aj[j];
      pnz  = pi[row + 1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai  = ao->i;
      pi  = p_oth->i;
      nzi = ai[i + 1] - ai[i];
      aj  = ao->j + ai[i];
      for (j = 0; j < nzi; j++) {
        row  = aj[j];
        pnz  = pi[row + 1] - pi[row];
        Jptr = p_oth->j + pi[row];
        PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
      }
    }
    /* lnk[0] holds the number of distinct columns accumulated for this row */
    apnz       = lnk[0];
    api[i + 1] = api[i] + apnz;
    if (ap_rmax < apnz) ap_rmax = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining < apnz) {
      PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean(pN, apnz, current_space->array, lnk, lnkbt));

    current_space->array += apnz;
    current_space->local_used += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  PetscCall(PetscMalloc2(api[am], &apj, api[am], &apv));
  PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
  PetscCall(PetscLLDestroy(lnk, lnkbt));

  /* Create AP_loc for reuse */
  PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
  PetscCall(MatSetType(ptap->AP_loc, ((PetscObject)p->A)->type_name));
#if defined(PETSC_USE_INFO)
  /* report achieved vs requested fill ratio for -info diagnostics */
  if (ao) {
    apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
  } else {
    apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
    PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n", (double)apfill));
  } else {
    PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc is empty \n"));
  }
#endif

  /* (2-1) compute symbolic Co = Ro*AP_loc  */
  /* ------------------------------------ */
  PetscCall(MatGetOptionsPrefix(A, &prefix));
  PetscCall(MatSetOptionsPrefix(ptap->Ro, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->Ro, "inner_offdiag_"));
  PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
  PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
  PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_C_oth_"));
  PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
  PetscCall(MatProductSetAlgorithm(ptap->C_oth, "default"));
  PetscCall(MatProductSetFill(ptap->C_oth, fill));
  PetscCall(MatProductSetFromOptions(ptap->C_oth));
  PetscCall(MatProductSymbolic(ptap->C_oth));

  /* (3) send coj of C_oth to other processors  */
  /* ------------------------------------------ */
  /* determine row ownership */
  PetscCall(PetscLayoutCreate(comm, &rowmap));
  rowmap->n  = pn;
  rowmap->bs = 1;
  PetscCall(PetscLayoutSetUp(rowmap));
  owners = rowmap->range;

  /* determine the number of messages to send, their lengths */
  PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
  PetscCall(PetscArrayzero(len_s, size));
  PetscCall(PetscArrayzero(len_si, size));

  c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
  coi   = c_oth->i;
  coj   = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  /* rows of C_oth are globally sorted (prmap = p->garray is increasing),
     so a single forward sweep over proc suffices to find each row's owner */
  for (i = 0; i < con; i++) {
    while (prmap[i] >= owners[proc + 1]) proc++;
    len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc = 0; proc < size; proc++) {
    owners_co[proc + 1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj  */
  PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
  PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));

  /* post the Irecv and Isend of coj */
  PetscCall(PetscCommGetNewTag(comm, &tagj));
  PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
  PetscCall(PetscMalloc1(nsend + 1, &swaits));
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    i = owners_co[proc];
    PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc */
  /* ---------------------------------------- */
  /* done here (after the Isends above) to overlap local work with communication */
  PetscCall(MatSetOptionsPrefix(ptap->Rd, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->Rd, "inner_diag_"));
  PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
  PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
  PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_C_loc_"));
  PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
  PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
  PetscCall(MatProductSetFill(ptap->C_loc, fill));
  PetscCall(MatProductSetFromOptions(ptap->C_loc));
  PetscCall(MatProductSymbolic(ptap->C_loc));

  c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;

  /* receives coj are complete */
  for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

  /* add received column indices into ta to update Crmax */
  for (k = 0; k < nrecv; k++) { /* k-th received message */
    Jptr = buf_rj[k];
    for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1));
  }
  PetscCall(PetscHMapIGetSize(ta, &Crmax));
  PetscCall(PetscHMapIDestroy(&ta));

  /* (4) send and recv coi */
  /*-----------------------*/
  PetscCall(PetscCommGetNewTag(comm, &tagi));
  PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
  PetscCall(PetscMalloc1(len + 1, &buf_s));
  buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows + 1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
      nzi                 = coi[i + 1] - coi[i];
      buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
      buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
    k++;
    buf_si += len_si[proc];
  }
  for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

  PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
  PetscCall(PetscFree(len_ri));
  PetscCall(PetscFree(swaits));
  PetscCall(PetscFree(buf_s));

  /* (5) compute the local portion of Cmpi      */
  /* ------------------------------------------ */
  /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */
  PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
  current_space = free_space;

  PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
  for (k = 0; k < nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
  }

  MatPreallocateBegin(comm, pn, pn, dnz, onz);
  PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
  for (i = 0; i < pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i + 1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));

    /* add received col data into lnk */
    for (k = 0; k < nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) {     /* i-th row */
        nzi  = *(nextci[k] + 1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
        nextrow[k]++;
        nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean(pN, nzi, current_space->array, lnk, lnkbt));
    PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
  }
  PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
  PetscCall(PetscLLDestroy(lnk, lnkbt));
  PetscCall(PetscFreeSpaceDestroy(free_space));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
  if (P->cmap->bs > 0) {
    PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
    PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
  }
  PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
  MatPreallocateEnd(dnz, onz);

  /* members in merge */
  PetscCall(PetscFree(id_r));
  PetscCall(PetscFree(len_r));
  PetscCall(PetscFree(buf_ri[0]));
  PetscCall(PetscFree(buf_ri));
  PetscCall(PetscFree(buf_rj[0]));
  PetscCall(PetscFree(buf_rj));
  PetscCall(PetscLayoutDestroy(&rowmap));

  /* dense accumulator of global length pN -- the reason this variant is "nonscalable" */
  PetscCall(PetscCalloc1(pN, &ptap->apa));

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1880 
/*
   MatPtAPNumeric_MPIAIJ_MPIAIJ - numeric phase of C = P^T*A*P, reusing the
   Mat_APMPI data attached to C->product by MatPtAPSymbolic_MPIAIJ_MPIAIJ().

   Steps:
     1) refresh Rd = Pd^T, Ro = Po^T (reuse path only);
     2) compute AP_loc = A_loc*P row by row with the dense accumulator apa;
     3) C_loc = Rd*AP_loc and C_oth = Ro*AP_loc via MatProductNumeric();
     then scatter C_loc and C_oth entries into C with ADD_VALUES and
     assemble.  On exit ptap->reuse is MAT_REUSE_MATRIX so subsequent calls
     take the reuse paths.
*/
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A, Mat P, Mat C)
{
  Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
  Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
  Mat_APMPI         *ptap;
  Mat                AP_loc, C_loc, C_oth;
  PetscInt           i, rstart, rend, cm, ncols, row;
  PetscInt          *api, *apj, am = A->rmap->n, j, col, apnz;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  ptap = (Mat_APMPI *)C->product->data;
  PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
  PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));
  /* 1) get R = Pd^T,Ro = Po^T */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
    PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap     = (Mat_SeqAIJ *)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
    PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* ---------------------------------------------- */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
  apa = ptap->apa; /* dense accumulator of global length pN, allocated by the symbolic phase */
  api = ap->i;
  apj = ap->j;
  for (i = 0; i < am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
    AProw_nonscalable(i, ad, ao, p_loc, p_oth, apa);
    apnz = api[i + 1] - api[i];
    /* compress the dense row apa back into the sparse slots of AP_loc,
       zeroing each accumulator entry as it is consumed so apa stays clean */
    for (j = 0; j < apnz; j++) {
      col                 = apj[j + api[i]];
      ap->a[j + ap->i[i]] = apa[col];
      apa[col]            = 0.0;
    }
  }
  /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
  PetscCall(PetscObjectStateIncrease((PetscObject)AP_loc));

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  PetscCall(MatProductNumeric(ptap->C_loc));
  PetscCall(MatProductNumeric(ptap->C_oth));
  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and Co to C */
  PetscCall(MatGetOwnershipRange(C, &rstart, &rend));

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_loc->data;
  cols  = c_seq->j;
  vals  = c_seq->a;

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
  /* when there are no off-processor parts.  */
  /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
  /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
  /* a table, and other, more complex stuff has to be done. */
  if (C->assembled) {
    C->was_assembled = PETSC_TRUE;
    C->assembled     = PETSC_FALSE;
  }
  if (C->was_assembled) {
    /* slow path: per-row insertion with global column lookup */
    for (i = 0; i < cm; i++) {
      ncols = c_seq->i[i + 1] - c_seq->i[i];
      row   = rstart + i;
      PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
      cols += ncols;
      vals += ncols;
    }
  } else {
    /* fast path: bulk copy of the whole CSR structure */
    PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
  }

  /* Co -> C, off-processor part */
  cm    = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_oth->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  for (i = 0; i < cm; i++) {
    ncols = c_seq->i[i + 1] - c_seq->i[i];
    row   = p->garray[i]; /* global row owned by another process; stashed until assembly */
    PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
    cols += ncols;
    vals += ncols;
  }

  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1995 
1996 PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
1997 {
1998   Mat_Product        *product = C->product;
1999   Mat                 A = product->A, P = product->B;
2000   MatProductAlgorithm alg  = product->alg;
2001   PetscReal           fill = product->fill;
2002   PetscBool           flg;
2003 
2004   PetscFunctionBegin;
2005   /* scalable: do R=P^T locally, then C=R*A*P */
2006   PetscCall(PetscStrcmp(alg, "scalable", &flg));
2007   if (flg) {
2008     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A, P, product->fill, C));
2009     C->ops->productnumeric = MatProductNumeric_PtAP;
2010     goto next;
2011   }
2012 
2013   /* nonscalable: do R=P^T locally, then C=R*A*P */
2014   PetscCall(PetscStrcmp(alg, "nonscalable", &flg));
2015   if (flg) {
2016     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ(A, P, fill, C));
2017     goto next;
2018   }
2019 
2020   /* allatonce */
2021   PetscCall(PetscStrcmp(alg, "allatonce", &flg));
2022   if (flg) {
2023     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A, P, fill, C));
2024     goto next;
2025   }
2026 
2027   /* allatonce_merged */
2028   PetscCall(PetscStrcmp(alg, "allatonce_merged", &flg));
2029   if (flg) {
2030     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A, P, fill, C));
2031     goto next;
2032   }
2033 
2034   /* backend general code */
2035   PetscCall(PetscStrcmp(alg, "backend", &flg));
2036   if (flg) {
2037     PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
2038     PetscFunctionReturn(PETSC_SUCCESS);
2039   }
2040 
2041   /* hypre */
2042 #if defined(PETSC_HAVE_HYPRE)
2043   PetscCall(PetscStrcmp(alg, "hypre", &flg));
2044   if (flg) {
2045     PetscCall(MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A, P, fill, C));
2046     C->ops->productnumeric = MatProductNumeric_PtAP;
2047     PetscFunctionReturn(PETSC_SUCCESS);
2048   }
2049 #endif
2050   SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
2051 
2052 next:
2053   C->ops->productnumeric = MatProductNumeric_PtAP;
2054   PetscFunctionReturn(PETSC_SUCCESS);
2055 }
2056