xref: /petsc/src/mat/impls/aij/mpi/mpiptap.c (revision fbf9dbe564678ed6eff1806adbc4c4f01b9743f4)
1 
2 /*
3   Defines projective product routines where A is an MPIAIJ matrix
4           C = P^T * A * P
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <../src/mat/impls/aij/mpi/mpiaij.h>
10 #include <petscbt.h>
11 #include <petsctime.h>
12 #include <petsc/private/hashmapiv.h>
13 #include <petsc/private/hashseti.h>
14 #include <petscsf.h>
15 
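/*
  Usage sketch (illustrative only, not compiled as part of this file): the kernels below are
  normally reached through the public MatPtAP() interface on MATMPIAIJ matrices; the fill
  estimate of 2.0 and the reuse pattern here are just an example.

     Mat A, P, C;
     // ... assemble A (n x n) and P (n x m) as MATMPIAIJ ...
     PetscCall(MatPtAP(A, P, MAT_INITIAL_MATRIX, 2.0, &C)); // symbolic + numeric product C = P^T*A*P
     // ... update the numerical values of A, keeping its nonzero pattern ...
     PetscCall(MatPtAP(A, P, MAT_REUSE_MATRIX, 2.0, &C));   // reuses the Mat_APMPI data attached to C
     PetscCall(MatDestroy(&C));
*/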
16 PetscErrorCode MatView_MPIAIJ_PtAP(Mat A, PetscViewer viewer)
17 {
18   PetscBool         iascii;
19   PetscViewerFormat format;
20   Mat_APMPI        *ptap;
21 
22   PetscFunctionBegin;
23   MatCheckProduct(A, 1);
24   ptap = (Mat_APMPI *)A->product->data;
25   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
26   if (iascii) {
27     PetscCall(PetscViewerGetFormat(viewer, &format));
28     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
29       if (ptap->algType == 0) {
30         PetscCall(PetscViewerASCIIPrintf(viewer, "using scalable MatPtAP() implementation\n"));
31       } else if (ptap->algType == 1) {
32         PetscCall(PetscViewerASCIIPrintf(viewer, "using nonscalable MatPtAP() implementation\n"));
33       } else if (ptap->algType == 2) {
34         PetscCall(PetscViewerASCIIPrintf(viewer, "using allatonce MatPtAP() implementation\n"));
35       } else if (ptap->algType == 3) {
36         PetscCall(PetscViewerASCIIPrintf(viewer, "using merged allatonce MatPtAP() implementation\n"));
37       }
38     }
39   }
40   PetscFunctionReturn(PETSC_SUCCESS);
41 }
42 
43 PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
44 {
45   Mat_APMPI           *ptap = (Mat_APMPI *)data;
46   Mat_Merge_SeqsToMPI *merge;
47 
48   PetscFunctionBegin;
49   PetscCall(PetscFree2(ptap->startsj_s, ptap->startsj_r));
50   PetscCall(PetscFree(ptap->bufa));
51   PetscCall(MatDestroy(&ptap->P_loc));
52   PetscCall(MatDestroy(&ptap->P_oth));
53   PetscCall(MatDestroy(&ptap->A_loc)); /* used by MatTransposeMatMult() */
54   PetscCall(MatDestroy(&ptap->Rd));
55   PetscCall(MatDestroy(&ptap->Ro));
56   if (ptap->AP_loc) { /* used by alg_rap */
57     Mat_SeqAIJ *ap = (Mat_SeqAIJ *)(ptap->AP_loc)->data;
58     PetscCall(PetscFree(ap->i));
59     PetscCall(PetscFree2(ap->j, ap->a));
60     PetscCall(MatDestroy(&ptap->AP_loc));
61   } else { /* used by alg_ptap */
62     PetscCall(PetscFree(ptap->api));
63     PetscCall(PetscFree(ptap->apj));
64   }
65   PetscCall(MatDestroy(&ptap->C_loc));
66   PetscCall(MatDestroy(&ptap->C_oth));
67   if (ptap->apa) PetscCall(PetscFree(ptap->apa));
68 
69   PetscCall(MatDestroy(&ptap->Pt));
70 
71   merge = ptap->merge;
72   if (merge) { /* used by alg_ptap */
73     PetscCall(PetscFree(merge->id_r));
74     PetscCall(PetscFree(merge->len_s));
75     PetscCall(PetscFree(merge->len_r));
76     PetscCall(PetscFree(merge->bi));
77     PetscCall(PetscFree(merge->bj));
78     PetscCall(PetscFree(merge->buf_ri[0]));
79     PetscCall(PetscFree(merge->buf_ri));
80     PetscCall(PetscFree(merge->buf_rj[0]));
81     PetscCall(PetscFree(merge->buf_rj));
82     PetscCall(PetscFree(merge->coi));
83     PetscCall(PetscFree(merge->coj));
84     PetscCall(PetscFree(merge->owners_co));
85     PetscCall(PetscLayoutDestroy(&merge->rowmap));
86     PetscCall(PetscFree(ptap->merge));
87   }
88   PetscCall(ISLocalToGlobalMappingDestroy(&ptap->ltog));
89 
90   PetscCall(PetscSFDestroy(&ptap->sf));
91   PetscCall(PetscFree(ptap->c_othi));
92   PetscCall(PetscFree(ptap->c_rmti));
93   PetscCall(PetscFree(ptap));
94   PetscFunctionReturn(PETSC_SUCCESS);
95 }
96 
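/*
  The scalable algorithm assembles C = P^T*A*P from per-rank pieces: with A = [Ad Ao] and
  P = [Pd Po] split into diagonal and off-diagonal blocks,

      AP_loc = Ad*P_loc + Ao*P_oth,   Rd = Pd^T,   Ro = Po^T,
      C_loc  = Rd*AP_loc,             C_oth = Ro*AP_loc,

  where P_oth holds the rows of P matching the off-diagonal columns of A. C_loc contributes to
  locally owned rows of C; the rows of C_oth belong to other ranks and are added in during
  assembly. The numeric routine below fills the values (and can be reused); the symbolic routine
  that follows it builds the patterns, the communication, and the preallocation of Cmpi.
*/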
97 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, Mat C)
98 {
99   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
100   Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
101   Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
102   Mat_APMPI         *ptap;
103   Mat                AP_loc, C_loc, C_oth;
104   PetscInt           i, rstart, rend, cm, ncols, row, *api, *apj, am = A->rmap->n, apnz, nout;
105   PetscScalar       *apa;
106   const PetscInt    *cols;
107   const PetscScalar *vals;
108 
109   PetscFunctionBegin;
110   MatCheckProduct(C, 3);
111   ptap = (Mat_APMPI *)C->product->data;
112   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
113   PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
114 
115   PetscCall(MatZeroEntries(C));
116 
117   /* 1) get Rd = Pd^T, Ro = Po^T */
118   if (ptap->reuse == MAT_REUSE_MATRIX) {
119     PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
120     PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
121   }
122 
123   /* 2) get AP_loc */
124   AP_loc = ptap->AP_loc;
125   ap     = (Mat_SeqAIJ *)AP_loc->data;
126 
127   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
128   if (ptap->reuse == MAT_REUSE_MATRIX) {
129     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
130     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
131     PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
132   }
133 
134   /* 2-2) compute numeric A_loc*P - dominating part */
135   /* get data from symbolic products */
136   p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
137   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
138 
139   api = ap->i;
140   apj = ap->j;
141   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, api[AP_loc->rmap->n], apj, apj));
142   for (i = 0; i < am; i++) {
143     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
144     apnz = api[i + 1] - api[i];
145     apa  = ap->a + api[i];
146     PetscCall(PetscArrayzero(apa, apnz));
147     AProw_scalable(i, ad, ao, p_loc, p_oth, api, apj, apa);
148   }
149   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, api[AP_loc->rmap->n], apj, &nout, apj));
150   PetscCheck(api[AP_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, api[AP_loc->rmap->n], nout);
151 
152   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
153   /* Always use scalable version since we are in the MPI scalable version */
154   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd, AP_loc, ptap->C_loc));
155   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro, AP_loc, ptap->C_oth));
156 
157   C_loc = ptap->C_loc;
158   C_oth = ptap->C_oth;
159 
160   /* add C_loc and C_oth to C */
161   PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
162 
163   /* C_loc -> C */
164   cm    = C_loc->rmap->N;
165   c_seq = (Mat_SeqAIJ *)C_loc->data;
166   cols  = c_seq->j;
167   vals  = c_seq->a;
168   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_loc->rmap->n], c_seq->j, c_seq->j));
169 
170   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
171   /* when there are no off-processor parts. */
172   /* If was_assembled is true, the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
173   /* is no longer valid. In that case the more general MatSetValues_MPIAIJ() must be used instead, which looks the column index up */
174   /* in a table and handles the additional bookkeeping. */
175   if (C->assembled) {
176     C->was_assembled = PETSC_TRUE;
177     C->assembled     = PETSC_FALSE;
178   }
179   if (C->was_assembled) {
180     for (i = 0; i < cm; i++) {
181       ncols = c_seq->i[i + 1] - c_seq->i[i];
182       row   = rstart + i;
183       PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
184       cols += ncols;
185       vals += ncols;
186     }
187   } else {
188     PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
189   }
190   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_loc->rmap->n], c_seq->j, &nout, c_seq->j));
191   PetscCheck(c_seq->i[C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout);
192 
193   /* C_oth -> C, off-processor part */
194   cm    = C_oth->rmap->N;
195   c_seq = (Mat_SeqAIJ *)C_oth->data;
196   cols  = c_seq->j;
197   vals  = c_seq->a;
198   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_oth->rmap->n], c_seq->j, c_seq->j));
199   for (i = 0; i < cm; i++) {
200     ncols = c_seq->i[i + 1] - c_seq->i[i];
201     row   = p->garray[i];
202     PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
203     cols += ncols;
204     vals += ncols;
205   }
206   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
207   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
208 
209   ptap->reuse = MAT_REUSE_MATRIX;
210 
211   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_oth->rmap->n], c_seq->j, &nout, c_seq->j));
212   PetscCheck(c_seq->i[C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_oth->rmap->n], nout);
213   PetscFunctionReturn(PETSC_SUCCESS);
214 }
215 
216 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, PetscReal fill, Mat Cmpi)
217 {
218   Mat_APMPI               *ptap;
219   Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
220   MPI_Comm                 comm;
221   PetscMPIInt              size, rank;
222   Mat                      P_loc, P_oth;
223   PetscFreeSpaceList       free_space = NULL, current_space = NULL;
224   PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
225   PetscInt                *lnk, i, k, pnz, row, nsend;
226   PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
227   PETSC_UNUSED PetscMPIInt icompleted = 0;
228   PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
229   const PetscInt          *owners;
230   PetscInt                 len, proc, *dnz, *onz, nzi, nspacedouble;
231   PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
232   MPI_Request             *swaits, *rwaits;
233   MPI_Status              *sstatus, rstatus;
234   PetscLayout              rowmap;
235   PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
236   PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
237   PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, Crmax, *aj, *ai, *pi, nout;
238   Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
239   PetscScalar             *apv;
240   PetscHMapI               ta;
241   MatType                  mtype;
242   const char              *prefix;
243 #if defined(PETSC_USE_INFO)
244   PetscReal apfill;
245 #endif
246 
247   PetscFunctionBegin;
248   MatCheckProduct(Cmpi, 4);
249   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
250   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
251   PetscCallMPI(MPI_Comm_size(comm, &size));
252   PetscCallMPI(MPI_Comm_rank(comm, &rank));
253 
254   if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;
255 
256   /* create symbolic parallel matrix Cmpi */
257   PetscCall(MatGetType(A, &mtype));
258   PetscCall(MatSetType(Cmpi, mtype));
259 
260   /* create struct Mat_APMPI and attach it to Cmpi later */
261   PetscCall(PetscNew(&ptap));
262   ptap->reuse   = MAT_INITIAL_MATRIX;
263   ptap->algType = 0;
264 
265   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
266   PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &P_oth));
267   /* get P_loc by taking all local rows of P */
268   PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &P_loc));
269 
270   ptap->P_loc = P_loc;
271   ptap->P_oth = P_oth;
272 
273   /* (0) compute Rd = Pd^T, Ro = Po^T  */
274   PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
275   PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));
276 
277   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
278   p_loc = (Mat_SeqAIJ *)P_loc->data;
279   if (P_oth) p_oth = (Mat_SeqAIJ *)P_oth->data;
280 
281   /* create and initialize a linked list */
282   PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* used when computing AP_loc and Cmpi */
283   MatRowMergeMax_SeqAIJ(p_loc, P_loc->rmap->N, ta);
284   MatRowMergeMax_SeqAIJ(p_oth, P_oth->rmap->N, ta);
285   PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
286 
287   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
288 
289   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
290   if (ao) {
291     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
292   } else {
293     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
294   }
295   current_space = free_space;
296   nspacedouble  = 0;
297 
298   PetscCall(PetscMalloc1(am + 1, &api));
299   api[0] = 0;
300   for (i = 0; i < am; i++) {
301     /* diagonal portion: Ad[i,:]*P */
302     ai  = ad->i;
303     pi  = p_loc->i;
304     nzi = ai[i + 1] - ai[i];
305     aj  = ad->j + ai[i];
306     for (j = 0; j < nzi; j++) {
307       row  = aj[j];
308       pnz  = pi[row + 1] - pi[row];
309       Jptr = p_loc->j + pi[row];
310       /* add non-zero cols of P into the sorted linked list lnk */
311       PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
312     }
313     /* off-diagonal portion: Ao[i,:]*P */
314     if (ao) {
315       ai  = ao->i;
316       pi  = p_oth->i;
317       nzi = ai[i + 1] - ai[i];
318       aj  = ao->j + ai[i];
319       for (j = 0; j < nzi; j++) {
320         row  = aj[j];
321         pnz  = pi[row + 1] - pi[row];
322         Jptr = p_oth->j + pi[row];
323         PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
324       }
325     }
326     apnz       = lnk[0];
327     api[i + 1] = api[i] + apnz;
328 
329     /* if free space is not available, double the total space in the list */
330     if (current_space->local_remaining < apnz) {
331       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
332       nspacedouble++;
333     }
334 
335     /* Copy data into free space, then initialize lnk */
336     PetscCall(PetscLLCondensedClean_Scalable(apnz, current_space->array, lnk));
337 
338     current_space->array += apnz;
339     current_space->local_used += apnz;
340     current_space->local_remaining -= apnz;
341   }
342   /* Allocate space for apj and apv, initialize apj, and */
343   /* destroy list of free space and other temporary array(s) */
344   PetscCall(PetscCalloc2(api[am], &apj, api[am], &apv));
345   PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
346   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
347 
348   /* Create AP_loc for reuse */
349   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
350   PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog));
351 
352 #if defined(PETSC_USE_INFO)
353   if (ao) {
354     apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
355   } else {
356     apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
357   }
358   ptap->AP_loc->info.mallocs           = nspacedouble;
359   ptap->AP_loc->info.fill_ratio_given  = fill;
360   ptap->AP_loc->info.fill_ratio_needed = apfill;
361 
362   if (api[am]) {
363     PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
364     PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n", (double)apfill));
365   } else {
366     PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc is empty \n"));
367   }
368 #endif
369 
370   /* (2-1) compute symbolic Co = Ro*AP_loc  */
371   PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
372   PetscCall(MatGetOptionsPrefix(A, &prefix));
373   PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
374   PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_offdiag_"));
375 
376   PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
377   PetscCall(MatProductSetAlgorithm(ptap->C_oth, "sorted"));
378   PetscCall(MatProductSetFill(ptap->C_oth, fill));
379   PetscCall(MatProductSetFromOptions(ptap->C_oth));
380   PetscCall(MatProductSymbolic(ptap->C_oth));
381 
382   /* (3) send coj of C_oth to other processors  */
383   /* determine row ownership */
384   PetscCall(PetscLayoutCreate(comm, &rowmap));
385   PetscCall(PetscLayoutSetLocalSize(rowmap, pn));
386   PetscCall(PetscLayoutSetBlockSize(rowmap, 1));
387   PetscCall(PetscLayoutSetUp(rowmap));
388   PetscCall(PetscLayoutGetRanges(rowmap, &owners));
389 
390   /* determine the number of messages to send, their lengths */
391   PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
392   PetscCall(PetscArrayzero(len_s, size));
393   PetscCall(PetscArrayzero(len_si, size));
394 
395   c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
396   coi   = c_oth->i;
397   coj   = c_oth->j;
398   con   = ptap->C_oth->rmap->n;
399   proc  = 0;
400   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, coi[con], coj, coj));
401   for (i = 0; i < con; i++) {
402     while (prmap[i] >= owners[proc + 1]) proc++;
403     len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
404     len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
405   }
406 
407   len          = 0; /* total length of all buf_si[] to be sent, see (4) */
408   owners_co[0] = 0;
409   nsend        = 0;
410   for (proc = 0; proc < size; proc++) {
411     owners_co[proc + 1] = owners_co[proc] + len_si[proc];
412     if (len_s[proc]) {
413       nsend++;
414       len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
415       len += len_si[proc];
416     }
417   }
418 
419   /* determine the number and length of messages to receive for coi and coj  */
420   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
421   PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));
422 
423   /* post the Irecv and Isend of coj */
424   PetscCall(PetscCommGetNewTag(comm, &tagj));
425   PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
426   PetscCall(PetscMalloc1(nsend + 1, &swaits));
427   for (proc = 0, k = 0; proc < size; proc++) {
428     if (!len_s[proc]) continue;
429     i = owners_co[proc];
430     PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
431     k++;
432   }
433 
434   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
435   PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
436   PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
437   PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
438   PetscCall(MatProductSetFill(ptap->C_loc, fill));
439 
440   PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
441   PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_diag_"));
442 
443   PetscCall(MatProductSetFromOptions(ptap->C_loc));
444   PetscCall(MatProductSymbolic(ptap->C_loc));
445 
446   c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
447   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, c_loc->j));
448 
449   /* wait until the receives of coj are complete */
450   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
451   PetscCall(PetscFree(rwaits));
452   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
453 
454   /* add received column indices into ta to update Crmax */
455   for (k = 0; k < nrecv; k++) { /* k-th received message */
456     Jptr = buf_rj[k];
457     for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1));
458   }
459   PetscCall(PetscHMapIGetSize(ta, &Crmax));
460   PetscCall(PetscHMapIDestroy(&ta));
461 
462   /* (4) send and recv coi */
463   PetscCall(PetscCommGetNewTag(comm, &tagi));
464   PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
465   PetscCall(PetscMalloc1(len + 1, &buf_s));
466   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
467   for (proc = 0, k = 0; proc < size; proc++) {
468     if (!len_s[proc]) continue;
469     /* form outgoing message for i-structure:
470          buf_si[0]:                 nrows to be sent
471                [1:nrows]:           row index (global)
472                [nrows+1:2*nrows+1]: i-structure index
473     */
474     nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
475     buf_si_i    = buf_si + nrows + 1;
476     buf_si[0]   = nrows;
477     buf_si_i[0] = 0;
478     nrows       = 0;
479     for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
480       nzi                 = coi[i + 1] - coi[i];
481       buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
482       buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
483       nrows++;
484     }
485     PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
486     k++;
487     buf_si += len_si[proc];
488   }
489   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
490   PetscCall(PetscFree(rwaits));
491   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
492 
493   PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
494   PetscCall(PetscFree(len_ri));
495   PetscCall(PetscFree(swaits));
496   PetscCall(PetscFree(buf_s));
497 
498   /* (5) compute the local portion of Cmpi      */
499   /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
500   PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
501   current_space = free_space;
502 
503   PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
504   for (k = 0; k < nrecv; k++) {
505     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
506     nrows       = *buf_ri_k[k];
507     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
508     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
509   }
510 
511   MatPreallocateBegin(comm, pn, pn, dnz, onz);
512   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
513   for (i = 0; i < pn; i++) {
514     /* add C_loc into Cmpi */
515     nzi  = c_loc->i[i + 1] - c_loc->i[i];
516     Jptr = c_loc->j + c_loc->i[i];
517     PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
518 
519     /* add received col data into lnk */
520     for (k = 0; k < nrecv; k++) { /* k-th received message */
521       if (i == *nextrow[k]) {     /* i-th row */
522         nzi  = *(nextci[k] + 1) - *nextci[k];
523         Jptr = buf_rj[k] + *nextci[k];
524         PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
525         nextrow[k]++;
526         nextci[k]++;
527       }
528     }
529     nzi = lnk[0];
530 
531     /* copy data into free space, then initialize lnk */
532     PetscCall(PetscLLCondensedClean_Scalable(nzi, current_space->array, lnk));
533     PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
534   }
535   PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
536   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
537   PetscCall(PetscFreeSpaceDestroy(free_space));
538 
539   /* local sizes and preallocation */
540   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
541   if (P->cmap->bs > 0) {
542     PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
543     PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
544   }
545   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
546   MatPreallocateEnd(dnz, onz);
547 
548   /* free communication buffers and the temporary row layout */
549   PetscCall(PetscFree(id_r));
550   PetscCall(PetscFree(len_r));
551   PetscCall(PetscFree(buf_ri[0]));
552   PetscCall(PetscFree(buf_ri));
553   PetscCall(PetscFree(buf_rj[0]));
554   PetscCall(PetscFree(buf_rj));
555   PetscCall(PetscLayoutDestroy(&rowmap));
556 
557   nout = 0;
558   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_oth->i[ptap->C_oth->rmap->n], c_oth->j, &nout, c_oth->j));
559   PetscCheck(c_oth->i[ptap->C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_oth->i[ptap->C_oth->rmap->n], nout);
560   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, &nout, c_loc->j));
561   PetscCheck(c_loc->i[ptap->C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_loc->i[ptap->C_loc->rmap->n], nout);
562 
563   /* attach the supporting struct to Cmpi for reuse */
564   Cmpi->product->data    = ptap;
565   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
566   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
567 
568   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
569   Cmpi->assembled        = PETSC_FALSE;
570   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
571   PetscFunctionReturn(PETSC_SUCCESS);
572 }
573 
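/*
  The two helpers below compute a single row of AP = A*P: the symbolic variant collects column
  indices into hash sets (split into the diagonal and off-diagonal parts of C), and the numeric
  variant accumulates values into a hash map keyed by column index. Both support an interlaced
  dof > 1 layout: a monolithic index i maps to block index ii = i/dof with component
  offset = i%dof (e.g. dof = 2, i = 5 gives ii = 2, offset = 1), and a column pj[k] of P expands
  back to the monolithic column pj[k]*dof + offset (shifted by pcstart for the diagonal block).
*/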
574 static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHSetI dht, PetscHSetI oht)
575 {
576   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
577   Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
578   PetscInt   *ai, nzi, j, *aj, row, col, *pi, *pj, pnz, nzpi, *p_othcols, k;
579   PetscInt    pcstart, pcend, column, offset;
580 
581   PetscFunctionBegin;
582   pcstart = P->cmap->rstart;
583   pcstart *= dof;
584   pcend = P->cmap->rend;
585   pcend *= dof;
586   /* diagonal portion: Ad[i,:]*P */
587   ai  = ad->i;
588   nzi = ai[i + 1] - ai[i];
589   aj  = ad->j + ai[i];
590   for (j = 0; j < nzi; j++) {
591     row    = aj[j];
592     offset = row % dof;
593     row /= dof;
594     nzpi = pd->i[row + 1] - pd->i[row];
595     pj   = pd->j + pd->i[row];
596     for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(dht, pj[k] * dof + offset + pcstart));
597   }
598   /* off-diagonal portion of P: Ad[i,:]*Po */
599   for (j = 0; j < nzi; j++) {
600     row    = aj[j];
601     offset = row % dof;
602     row /= dof;
603     nzpi = po->i[row + 1] - po->i[row];
604     pj   = po->j + po->i[row];
605     for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(oht, p->garray[pj[k]] * dof + offset));
606   }
607 
608   /* off diagonal part: Ao[i, :]*P_oth */
609   if (ao) {
610     ai  = ao->i;
611     pi  = p_oth->i;
612     nzi = ai[i + 1] - ai[i];
613     aj  = ao->j + ai[i];
614     for (j = 0; j < nzi; j++) {
615       row       = aj[j];
616       offset    = a->garray[row] % dof;
617       row       = map[row];
618       pnz       = pi[row + 1] - pi[row];
619       p_othcols = p_oth->j + pi[row];
620       for (col = 0; col < pnz; col++) {
621         column = p_othcols[col] * dof + offset;
622         if (column >= pcstart && column < pcend) {
623           PetscCall(PetscHSetIAdd(dht, column));
624         } else {
625           PetscCall(PetscHSetIAdd(oht, column));
626         }
627       }
628     }
629   } /* end if (ao) */
630   PetscFunctionReturn(PETSC_SUCCESS);
631 }
632 
633 static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHMapIV hmap)
634 {
635   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
636   Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
637   PetscInt   *ai, nzi, j, *aj, row, col, *pi, pnz, *p_othcols, pcstart, *pj, k, nzpi, offset;
638   PetscScalar ra, *aa, *pa;
639 
640   PetscFunctionBegin;
641   pcstart = P->cmap->rstart;
642   pcstart *= dof;
643 
644   /* diagonal portion: Ad[i,:]*P */
645   ai  = ad->i;
646   nzi = ai[i + 1] - ai[i];
647   aj  = ad->j + ai[i];
648   aa  = ad->a + ai[i];
649   for (j = 0; j < nzi; j++) {
650     ra     = aa[j];
651     row    = aj[j];
652     offset = row % dof;
653     row /= dof;
654     nzpi = pd->i[row + 1] - pd->i[row];
655     pj   = pd->j + pd->i[row];
656     pa   = pd->a + pd->i[row];
657     for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, pj[k] * dof + offset + pcstart, ra * pa[k]));
658     PetscCall(PetscLogFlops(2.0 * nzpi));
659   }
660   for (j = 0; j < nzi; j++) {
661     ra     = aa[j];
662     row    = aj[j];
663     offset = row % dof;
664     row /= dof;
665     nzpi = po->i[row + 1] - po->i[row];
666     pj   = po->j + po->i[row];
667     pa   = po->a + po->i[row];
668     for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, p->garray[pj[k]] * dof + offset, ra * pa[k]));
669     PetscCall(PetscLogFlops(2.0 * nzpi));
670   }
671 
672   /* off diagonal part: Ao[i, :]*P_oth */
673   if (ao) {
674     ai  = ao->i;
675     pi  = p_oth->i;
676     nzi = ai[i + 1] - ai[i];
677     aj  = ao->j + ai[i];
678     aa  = ao->a + ai[i];
679     for (j = 0; j < nzi; j++) {
680       row       = aj[j];
681       offset    = a->garray[row] % dof;
682       row       = map[row];
683       ra        = aa[j];
684       pnz       = pi[row + 1] - pi[row];
685       p_othcols = p_oth->j + pi[row];
686       pa        = p_oth->a + pi[row];
687       for (col = 0; col < pnz; col++) PetscCall(PetscHMapIVAddValue(hmap, p_othcols[col] * dof + offset, ra * pa[col]));
688       PetscCall(PetscLogFlops(2.0 * pnz));
689     }
690   } /* end if (ao) */
691 
692   PetscFunctionReturn(PETSC_SUCCESS);
693 }
694 
695 PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat, Mat, PetscInt dof, MatReuse, Mat *);
696 
697 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, Mat C)
698 {
699   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
700   Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
701   Mat_APMPI      *ptap;
702   PetscHMapIV     hmap;
703   PetscInt        i, j, jj, kk, nzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, ccstart, ccend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, *dcc, *occ, loc;
704   PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
705   PetscInt        offset, ii, pocol;
706   const PetscInt *mappingindices;
707   IS              map;
708 
709   PetscFunctionBegin;
710   MatCheckProduct(C, 4);
711   ptap = (Mat_APMPI *)C->product->data;
712   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
713   PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
714 
715   PetscCall(MatZeroEntries(C));
716 
717   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
718   if (ptap->reuse == MAT_REUSE_MATRIX) {
719     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
720     PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
721   }
722   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
723 
724   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
725   pon *= dof;
726   PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
727   PetscCall(MatGetLocalSize(A, &am, NULL));
728   cmaxr = 0;
729   for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
730   PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
731   PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
732   PetscCall(ISGetIndices(map, &mappingindices));
733   for (i = 0; i < am && pon; i++) {
734     PetscCall(PetscHMapIVClear(hmap));
735     offset = i % dof;
736     ii     = i / dof;
737     nzi    = po->i[ii + 1] - po->i[ii];
738     if (!nzi) continue;
739     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
740     voff = 0;
741     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
742     if (!voff) continue;
743 
744     /* Form C(ii, :) */
745     poj = po->j + po->i[ii];
746     poa = po->a + po->i[ii];
747     for (j = 0; j < nzi; j++) {
748       pocol   = poj[j] * dof + offset;
749       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
750       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
751       for (jj = 0; jj < voff; jj++) {
752         apvaluestmp[jj] = apvalues[jj] * poa[j];
753         /* If the row is empty */
754         if (!c_rmtc[pocol]) {
755           c_rmtjj[jj] = apindices[jj];
756           c_rmtaa[jj] = apvaluestmp[jj];
757           c_rmtc[pocol]++;
758         } else {
759           PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
760           if (loc >= 0) { /* hit */
761             c_rmtaa[loc] += apvaluestmp[jj];
762             PetscCall(PetscLogFlops(1.0));
763           } else { /* new element */
764             loc = -(loc + 1);
765             /* shift existing entries to make room for the new one */
766             for (kk = c_rmtc[pocol]; kk > loc; kk--) {
767               c_rmtjj[kk] = c_rmtjj[kk - 1];
768               c_rmtaa[kk] = c_rmtaa[kk - 1];
769             } /* End kk */
770             c_rmtjj[loc] = apindices[jj];
771             c_rmtaa[loc] = apvaluestmp[jj];
772             c_rmtc[pocol]++;
773           }
774         }
775       } /* End jj */
776       PetscCall(PetscLogFlops(voff));
777     }   /* End j */
778   }     /* End i */
779 
780   PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
781   PetscCall(PetscHMapIVDestroy(&hmap));
782 
783   PetscCall(MatGetLocalSize(P, NULL, &pn));
784   pn *= dof;
785   PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));
786 
787   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
788   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
789   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
790   pcstart = pcstart * dof;
791   pcend   = pcend * dof;
792   cd      = (Mat_SeqAIJ *)(c->A)->data;
793   co      = (Mat_SeqAIJ *)(c->B)->data;
794 
795   cmaxr = 0;
796   for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
797   PetscCall(PetscCalloc5(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pn, &dcc, pn, &occ));
798   PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
799   for (i = 0; i < am && pn; i++) {
800     PetscCall(PetscHMapIVClear(hmap));
801     offset = i % dof;
802     ii     = i / dof;
803     nzi    = pd->i[ii + 1] - pd->i[ii];
804     if (!nzi) continue;
805     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
806     voff = 0;
807     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
808     if (!voff) continue;
809     /* Form C(ii, :) */
810     pdj = pd->j + pd->i[ii];
811     pda = pd->a + pd->i[ii];
812     for (j = 0; j < nzi; j++) {
813       row = pcstart + pdj[j] * dof + offset;
814       for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
815       PetscCall(PetscLogFlops(voff));
816       PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
817     }
818   }
819   PetscCall(ISRestoreIndices(map, &mappingindices));
820   PetscCall(MatGetOwnershipRangeColumn(C, &ccstart, &ccend));
821   PetscCall(PetscFree5(apindices, apvalues, apvaluestmp, dcc, occ));
822   PetscCall(PetscHMapIVDestroy(&hmap));
823   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
824   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
825   PetscCall(PetscFree2(c_rmtj, c_rmta));
826 
827   /* Add contributions from remote */
828   for (i = 0; i < pn; i++) {
829     row = i + pcstart;
830     PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
831   }
832   PetscCall(PetscFree2(c_othj, c_otha));
833 
834   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
835   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
836 
837   ptap->reuse = MAT_REUSE_MATRIX;
838   PetscFunctionReturn(PETSC_SUCCESS);
839 }
840 
841 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, Mat C)
842 {
843   PetscFunctionBegin;
844 
845   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, C));
846   PetscFunctionReturn(PETSC_SUCCESS);
847 }
848 
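/*
  Note on the two all-at-once numeric kernels: the "overlapping" variant above first forms the
  remote (off-diagonal) contributions, starts the PetscSF reduction, and only then computes the
  local contributions, so communication overlaps with computation; the "merged" variant below
  forms the remote and local contributions of each row in one pass and performs the reduction
  afterwards.
*/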
849 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, Mat C)
850 {
851   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
852   Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
853   Mat_APMPI      *ptap;
854   PetscHMapIV     hmap;
855   PetscInt        i, j, jj, kk, nzi, dnzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, loc;
856   PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
857   PetscInt        offset, ii, pocol;
858   const PetscInt *mappingindices;
859   IS              map;
860 
861   PetscFunctionBegin;
862   MatCheckProduct(C, 4);
863   ptap = (Mat_APMPI *)C->product->data;
864   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
865   PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
866 
867   PetscCall(MatZeroEntries(C));
868 
869   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
870   if (ptap->reuse == MAT_REUSE_MATRIX) {
871     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
872     PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
873   }
874   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
875   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
876   pon *= dof;
877   PetscCall(MatGetLocalSize(P, NULL, &pn));
878   pn *= dof;
879 
880   PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
881   PetscCall(MatGetLocalSize(A, &am, NULL));
882   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
883   pcstart *= dof;
884   pcend *= dof;
885   cmaxr = 0;
886   for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
887   cd = (Mat_SeqAIJ *)(c->A)->data;
888   co = (Mat_SeqAIJ *)(c->B)->data;
889   for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
890   PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
891   PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
892   PetscCall(ISGetIndices(map, &mappingindices));
893   for (i = 0; i < am && (pon || pn); i++) {
894     PetscCall(PetscHMapIVClear(hmap));
895     offset = i % dof;
896     ii     = i / dof;
897     nzi    = po->i[ii + 1] - po->i[ii];
898     dnzi   = pd->i[ii + 1] - pd->i[ii];
899     if (!nzi && !dnzi) continue;
900     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
901     voff = 0;
902     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
903     if (!voff) continue;
904 
905     /* Form remote C(ii, :) */
906     poj = po->j + po->i[ii];
907     poa = po->a + po->i[ii];
908     for (j = 0; j < nzi; j++) {
909       pocol   = poj[j] * dof + offset;
910       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
911       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
912       for (jj = 0; jj < voff; jj++) {
913         apvaluestmp[jj] = apvalues[jj] * poa[j];
914         /* If the row is empty */
915         if (!c_rmtc[pocol]) {
916           c_rmtjj[jj] = apindices[jj];
917           c_rmtaa[jj] = apvaluestmp[jj];
918           c_rmtc[pocol]++;
919         } else {
920           PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
921           if (loc >= 0) { /* hit */
922             c_rmtaa[loc] += apvaluestmp[jj];
923             PetscCall(PetscLogFlops(1.0));
924           } else { /* new element */
925             loc = -(loc + 1);
926             /* shift existing entries to make room for the new one */
927             for (kk = c_rmtc[pocol]; kk > loc; kk--) {
928               c_rmtjj[kk] = c_rmtjj[kk - 1];
929               c_rmtaa[kk] = c_rmtaa[kk - 1];
930             } /* End kk */
931             c_rmtjj[loc] = apindices[jj];
932             c_rmtaa[loc] = apvaluestmp[jj];
933             c_rmtc[pocol]++;
934           }
935         }
936       } /* End jj */
937       PetscCall(PetscLogFlops(voff));
938     } /* End j */
939 
940     /* Form local C(ii, :) */
941     pdj = pd->j + pd->i[ii];
942     pda = pd->a + pd->i[ii];
943     for (j = 0; j < dnzi; j++) {
944       row = pcstart + pdj[j] * dof + offset;
945       for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
946       PetscCall(PetscLogFlops(voff));
947       PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
948     } /* End j */
949   }   /* End i */
950 
951   PetscCall(ISRestoreIndices(map, &mappingindices));
952   PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
953   PetscCall(PetscHMapIVDestroy(&hmap));
954   PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));
955 
956   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
957   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
958   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
959   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
960   PetscCall(PetscFree2(c_rmtj, c_rmta));
961 
962   /* Add contributions from remote */
963   for (i = 0; i < pn; i++) {
964     row = i + pcstart;
965     PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
966   }
967   PetscCall(PetscFree2(c_othj, c_otha));
968 
969   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
970   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
971 
972   ptap->reuse = MAT_REUSE_MATRIX;
973   PetscFunctionReturn(PETSC_SUCCESS);
974 }
975 
976 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, Mat C)
977 {
978   PetscFunctionBegin;
979 
980   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, C));
981   PetscFunctionReturn(PETSC_SUCCESS);
982 }
983 
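/*
  Symbolic setup for the all-at-once algorithms: the rows of C computed here on behalf of other
  ranks (one per off-diagonal column of P, times dof) get CSR-style offsets in c_rmti, and a
  PetscSF is built whose leaves are those remote entries and whose roots are the entries received
  for locally owned rows of C (offsets in c_othi), so that column indices, and later values, can
  be reduced onto the owning rank.
*/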
984 /* TODO: move algorithm selection to MatProductSetFromOptions */
985 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
986 {
987   Mat_APMPI      *ptap;
988   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
989   MPI_Comm        comm;
990   Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
991   MatType         mtype;
992   PetscSF         sf;
993   PetscSFNode    *iremote;
994   PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
995   const PetscInt *rootdegrees;
996   PetscHSetI      ht, oht, *hta, *hto;
997   PetscInt        pn, pon, *c_rmtc, i, j, nzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
998   PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
999   PetscInt        nalg = 2, alg = 0, offset, ii;
1000   PetscMPIInt     owner;
1001   const PetscInt *mappingindices;
1002   PetscBool       flg;
1003   const char     *algTypes[2] = {"overlapping", "merged"};
1004   IS              map;
1005 
1006   PetscFunctionBegin;
1007   MatCheckProduct(Cmpi, 5);
1008   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
1009   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1010 
1011   /* Create symbolic parallel matrix Cmpi */
1012   PetscCall(MatGetLocalSize(P, NULL, &pn));
1013   pn *= dof;
1014   PetscCall(MatGetType(A, &mtype));
1015   PetscCall(MatSetType(Cmpi, mtype));
1016   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1017 
1018   PetscCall(PetscNew(&ptap));
1019   ptap->reuse   = MAT_INITIAL_MATRIX;
1020   ptap->algType = 2;
1021 
1022   /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1023   PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
1024   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
1025   /* This equals the number of off-diagonal columns in P */
1026   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1027   pon *= dof;
1028   /* offsets */
1029   PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
1030   /* The number of columns we will send to remote ranks */
1031   PetscCall(PetscMalloc1(pon, &c_rmtc));
1032   PetscCall(PetscMalloc1(pon, &hta));
1033   for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
1034   PetscCall(MatGetLocalSize(A, &am, NULL));
1035   PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
1036   /* Create hash table to merge all columns for C(i, :) */
1037   PetscCall(PetscHSetICreate(&ht));
1038 
1039   PetscCall(ISGetIndices(map, &mappingindices));
1040   ptap->c_rmti[0] = 0;
1041   /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processors) */
1042   for (i = 0; i < am && pon; i++) {
1043     /* Form one row of AP */
1044     PetscCall(PetscHSetIClear(ht));
1045     offset = i % dof;
1046     ii     = i / dof;
1047     /* If the off diag is empty, we should not do any calculation */
1048     nzi = po->i[ii + 1] - po->i[ii];
1049     if (!nzi) continue;
1050 
1051     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, ht));
1052     PetscCall(PetscHSetIGetSize(ht, &htsize));
1053     /* If AP is empty, just continue */
1054     if (!htsize) continue;
1055     /* Form C(ii, :) */
1056     poj = po->j + po->i[ii];
1057     for (j = 0; j < nzi; j++) PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
1058   }
1059 
1060   for (i = 0; i < pon; i++) {
1061     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1062     ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
1063     c_rmtc[i]           = htsize;
1064   }
1065 
1066   PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));
1067 
1068   for (i = 0; i < pon; i++) {
1069     off = 0;
1070     PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
1071     PetscCall(PetscHSetIDestroy(&hta[i]));
1072   }
1073   PetscCall(PetscFree(hta));
1074 
1075   PetscCall(PetscMalloc1(pon, &iremote));
1076   for (i = 0; i < pon; i++) {
1077     owner  = 0;
1078     lidx   = 0;
1079     offset = i % dof;
1080     ii     = i / dof;
1081     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1082     iremote[i].index = lidx * dof + offset;
1083     iremote[i].rank  = owner;
1084   }
1085 
1086   PetscCall(PetscSFCreate(comm, &sf));
1087   PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1088   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1089   PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
1090   PetscCall(PetscSFSetFromOptions(sf));
1091   PetscCall(PetscSFSetUp(sf));
1092   /* How many neighbors have contributions to my rows? */
1093   PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
1094   PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
1095   rootspacesize = 0;
1096   for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
1097   PetscCall(PetscMalloc1(rootspacesize, &rootspace));
1098   PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
1099   /* Get information from leaves:
1100    * the number of columns other ranks contribute to my rows
1101    */
1102   PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
1103   PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
1104   PetscCall(PetscFree(c_rmtc));
1105   PetscCall(PetscCalloc1(pn + 1, &ptap->c_othi));
1106   /* The number of columns received for each row */
1107   ptap->c_othi[0]     = 0;
1108   rootspacesize       = 0;
1109   rootspaceoffsets[0] = 0;
1110   for (i = 0; i < pn; i++) {
1111     rcvncols = 0;
1112     for (j = 0; j < rootdegrees[i]; j++) {
1113       rcvncols += rootspace[rootspacesize];
1114       rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1115       rootspacesize++;
1116     }
1117     ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
1118   }
1119   PetscCall(PetscFree(rootspace));
1120 
1121   PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
1122   PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1123   PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1124   PetscCall(PetscSFDestroy(&sf));
1125   PetscCall(PetscFree(rootspaceoffsets));
1126 
1127   PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
1128   nleaves = 0;
1129   for (i = 0; i < pon; i++) {
1130     owner = 0;
1131     ii    = i / dof;
1132     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
1133     sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
1134     for (j = 0; j < sendncols; j++) {
1135       iremote[nleaves].rank    = owner;
1136       iremote[nleaves++].index = c_rmtoffsets[i] + j;
1137     }
1138   }
1139   PetscCall(PetscFree(c_rmtoffsets));
1140   PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));
1141 
1142   PetscCall(PetscSFCreate(comm, &ptap->sf));
1143   PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1144   PetscCall(PetscSFSetFromOptions(ptap->sf));
1145   /* One to one map */
1146   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1147 
1148   PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
1149   PetscCall(PetscHSetICreate(&oht));
1150   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1151   pcstart *= dof;
1152   pcend *= dof;
1153   PetscCall(PetscMalloc2(pn, &hta, pn, &hto));
1154   for (i = 0; i < pn; i++) {
1155     PetscCall(PetscHSetICreate(&hta[i]));
1156     PetscCall(PetscHSetICreate(&hto[i]));
1157   }
1158   /* Work on local part */
1159   /* 4) Pass 1: Estimate memory for C_loc */
1160   for (i = 0; i < am && pn; i++) {
1161     PetscCall(PetscHSetIClear(ht));
1162     PetscCall(PetscHSetIClear(oht));
1163     offset = i % dof;
1164     ii     = i / dof;
1165     nzi    = pd->i[ii + 1] - pd->i[ii];
1166     if (!nzi) continue;
1167 
1168     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
1169     PetscCall(PetscHSetIGetSize(ht, &htsize));
1170     PetscCall(PetscHSetIGetSize(oht, &htosize));
1171     if (!(htsize + htosize)) continue;
1172     /* Form C(ii, :) */
1173     pdj = pd->j + pd->i[ii];
1174     for (j = 0; j < nzi; j++) {
1175       PetscCall(PetscHSetIUpdate(hta[pdj[j] * dof + offset], ht));
1176       PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
1177     }
1178   }
1179 
1180   PetscCall(ISRestoreIndices(map, &mappingindices));
1181 
1182   PetscCall(PetscHSetIDestroy(&ht));
1183   PetscCall(PetscHSetIDestroy(&oht));
1184 
1185   /* Get remote data */
1186   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1187   PetscCall(PetscFree(c_rmtj));
1188 
1189   for (i = 0; i < pn; i++) {
1190     nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
1191     rdj = c_othj + ptap->c_othi[i];
1192     for (j = 0; j < nzi; j++) {
1193       col = rdj[j];
1194       /* diag part */
1195       if (col >= pcstart && col < pcend) {
1196         PetscCall(PetscHSetIAdd(hta[i], col));
1197       } else { /* off diag */
1198         PetscCall(PetscHSetIAdd(hto[i], col));
1199       }
1200     }
1201     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1202     dnz[i] = htsize;
1203     PetscCall(PetscHSetIDestroy(&hta[i]));
1204     PetscCall(PetscHSetIGetSize(hto[i], &htsize));
1205     onz[i] = htsize;
1206     PetscCall(PetscHSetIDestroy(&hto[i]));
1207   }
1208 
1209   PetscCall(PetscFree2(hta, hto));
1210   PetscCall(PetscFree(c_othj));
1211 
1212   /* local sizes and preallocation */
1213   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1214   PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
1215   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1216   PetscCall(MatSetUp(Cmpi));
1217   PetscCall(PetscFree2(dnz, onz));
1218 
1219   /* attach the supporting struct to Cmpi for reuse */
1220   Cmpi->product->data    = ptap;
1221   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1222   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
1223 
1224   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1225   Cmpi->assembled = PETSC_FALSE;
1226   /* pick an algorithm */
1227   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
1228   alg = 0;
1229   PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1230   PetscOptionsEnd();
1231   switch (alg) {
1232   case 0:
1233     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1234     break;
1235   case 1:
1236     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1237     break;
1238   default:
1239     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, " Unsupported allatonce numerical algorithm ");
1240   }
1241   PetscFunctionReturn(PETSC_SUCCESS);
1242 }
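/*
  Example (runtime selection; the option name and values are taken from the PetscOptionsEList()
  call above, the executable name is just a placeholder):

      ./app -matptap_allatonce_via merged

  chooses the merged numeric kernel instead of the default "overlapping" one for the all-at-once
  algorithm set up above.
*/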
1243 
1244 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, PetscReal fill, Mat C)
1245 {
1246   PetscFunctionBegin;
1247   PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, fill, C));
1248   PetscFunctionReturn(PETSC_SUCCESS);
1249 }
1250 
1251 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
1252 {
1253   Mat_APMPI      *ptap;
1254   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
1255   MPI_Comm        comm;
1256   Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
1257   MatType         mtype;
1258   PetscSF         sf;
1259   PetscSFNode    *iremote;
1260   PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
1261   const PetscInt *rootdegrees;
1262   PetscHSetI      ht, oht, *hta, *hto, *htd;
1263   PetscInt        pn, pon, *c_rmtc, i, j, nzi, dnzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
1264   PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
1265   PetscInt        nalg = 2, alg = 0, offset, ii;
1266   PetscMPIInt     owner;
1267   PetscBool       flg;
1268   const char     *algTypes[2] = {"merged", "overlapping"};
1269   const PetscInt *mappingindices;
1270   IS              map;
1271 
1272   PetscFunctionBegin;
1273   MatCheckProduct(Cmpi, 5);
1274   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
1275   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1276 
1277   /* Create symbolic parallel matrix Cmpi */
1278   PetscCall(MatGetLocalSize(P, NULL, &pn));
1279   pn *= dof;
1280   PetscCall(MatGetType(A, &mtype));
1281   PetscCall(MatSetType(Cmpi, mtype));
1282   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1283 
1284   PetscCall(PetscNew(&ptap));
1285   ptap->reuse   = MAT_INITIAL_MATRIX;
1286   ptap->algType = 3;
1287 
1288   /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1289   PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
1290   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
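  /* Descriptive note: P_oth holds the rows of P (owned by other ranks) that correspond to the nonzero
     off-diagonal columns of the local part of A; the IS published under "aoffdiagtopothmapping"
     provides the index translation used below (via mappingindices) when one row of AP is formed. */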
1291 
1292   /* This equals the number of off-diagonal columns in P */
1293   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1294   pon *= dof;
1295   /* offsets */
1296   PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
1297   /* The number of columns we will send to remote ranks */
1298   PetscCall(PetscMalloc1(pon, &c_rmtc));
1299   PetscCall(PetscMalloc1(pon, &hta));
1300   for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
1301   PetscCall(MatGetLocalSize(A, &am, NULL));
1302   PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
1303   /* Create hash table to merge all columns for C(i, :) */
1304   PetscCall(PetscHSetICreate(&ht));
1305   PetscCall(PetscHSetICreate(&oht));
1306   PetscCall(PetscMalloc2(pn, &htd, pn, &hto));
1307   for (i = 0; i < pn; i++) {
1308     PetscCall(PetscHSetICreate(&htd[i]));
1309     PetscCall(PetscHSetICreate(&hto[i]));
1310   }
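  /* Descriptive note on the work hash sets used below:
     - hta[0..pon): one per off-diagonal column of P (scaled by dof), i.e. per row of C owned by another
                    rank; collects the C column indices this rank contributes and must ship to the owner,
     - htd[0..pn):  for each locally owned row of C, the column indices falling in the diagonal block,
     - hto[0..pn):  for each locally owned row of C, the column indices falling in the off-diagonal block,
     - ht/oht:      scratch sets holding the diagonal/off-diagonal columns of the current row of AP. */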
1311 
1312   PetscCall(ISGetIndices(map, &mappingindices));
1313   ptap->c_rmti[0] = 0;
1314   /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processors) */
1315   for (i = 0; i < am && (pon || pn); i++) {
1316     /* Form one row of AP */
1317     PetscCall(PetscHSetIClear(ht));
1318     PetscCall(PetscHSetIClear(oht));
1319     offset = i % dof;
1320     ii     = i / dof;
1321     /* If this row of P is empty in both its diagonal and off-diagonal parts, there is nothing to compute */
1322     nzi  = po->i[ii + 1] - po->i[ii];
1323     dnzi = pd->i[ii + 1] - pd->i[ii];
1324     if (!nzi && !dnzi) continue;
1325 
1326     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
1327     PetscCall(PetscHSetIGetSize(ht, &htsize));
1328     PetscCall(PetscHSetIGetSize(oht, &htosize));
1329     /* If AP is empty, just continue */
1330     if (!(htsize + htosize)) continue;
1331 
1332     /* Form remote C(ii, :) */
1333     poj = po->j + po->i[ii];
1334     for (j = 0; j < nzi; j++) {
1335       PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
1336       PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], oht));
1337     }
1338 
1339     /* Form local C(ii, :) */
1340     pdj = pd->j + pd->i[ii];
1341     for (j = 0; j < dnzi; j++) {
1342       PetscCall(PetscHSetIUpdate(htd[pdj[j] * dof + offset], ht));
1343       PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
1344     }
1345   }
1346 
1347   PetscCall(ISRestoreIndices(map, &mappingindices));
1348 
1349   PetscCall(PetscHSetIDestroy(&ht));
1350   PetscCall(PetscHSetIDestroy(&oht));
1351 
1352   for (i = 0; i < pon; i++) {
1353     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1354     ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
1355     c_rmtc[i]           = htsize;
1356   }
1357 
1358   PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));
1359 
1360   for (i = 0; i < pon; i++) {
1361     off = 0;
1362     PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
1363     PetscCall(PetscHSetIDestroy(&hta[i]));
1364   }
1365   PetscCall(PetscFree(hta));
1366 
1367   PetscCall(PetscMalloc1(pon, &iremote));
1368   for (i = 0; i < pon; i++) {
1369     owner  = 0;
1370     lidx   = 0;
1371     offset = i % dof;
1372     ii     = i / dof;
1373     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1374     iremote[i].index = lidx * dof + offset;
1375     iremote[i].rank  = owner;
1376   }
1377 
1378   PetscCall(PetscSFCreate(comm, &sf));
1379   PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1380   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1381   PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
1382   PetscCall(PetscSFSetFromOptions(sf));
1383   PetscCall(PetscSFSetUp(sf));
1384   /* How many neighbors have contributions to my rows? */
1385   PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
1386   PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
1387   rootspacesize = 0;
1388   for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
1389   PetscCall(PetscMalloc1(rootspacesize, &rootspace));
1390   PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
1391   /* Get information from the leaves:
1392    * the number of columns other ranks contribute to each of my rows
1393    */
1394   PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
1395   PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
1396   PetscCall(PetscFree(c_rmtc));
1397   PetscCall(PetscMalloc1(pn + 1, &ptap->c_othi));
1398   /* Count the number of columns received for each row and build the row offsets c_othi */
1399   ptap->c_othi[0]     = 0;
1400   rootspacesize       = 0;
1401   rootspaceoffsets[0] = 0;
1402   for (i = 0; i < pn; i++) {
1403     rcvncols = 0;
1404     for (j = 0; j < rootdegrees[i]; j++) {
1405       rcvncols += rootspace[rootspacesize];
1406       rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1407       rootspacesize++;
1408     }
1409     ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
1410   }
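  /* Illustrative example with made-up numbers: if this rank owns pn = 2 rows of C with
     rootdegrees = {2, 1} (two leaves contribute to row 0, one to row 1) and the gathered per-leaf
     column counts are rootspace = {3, 2, 4}, then the loop above produces
     rootspaceoffsets = {0, 3, 5, 9} and the receive-row offsets ptap->c_othi = {0, 5, 9}. */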
1411   PetscCall(PetscFree(rootspace));
1412 
1413   PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
1414   PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1415   PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1416   PetscCall(PetscSFDestroy(&sf));
1417   PetscCall(PetscFree(rootspaceoffsets));
1418 
1419   PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
1420   nleaves = 0;
1421   for (i = 0; i < pon; i++) {
1422     owner = 0;
1423     ii    = i / dof;
1424     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
1425     sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
1426     for (j = 0; j < sendncols; j++) {
1427       iremote[nleaves].rank    = owner;
1428       iremote[nleaves++].index = c_rmtoffsets[i] + j;
1429     }
1430   }
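  /* Descriptive note: in this second star forest every column index to be sent becomes a leaf whose
     root is a slot in the owner's contiguous receive buffer (index c_rmtoffsets[i] + j), so the
     PetscSFReduce with MPI_REPLACE below simply copies c_rmtj into the owners' c_othj in row order. */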
1431   PetscCall(PetscFree(c_rmtoffsets));
1432   PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));
1433 
1434   PetscCall(PetscSFCreate(comm, &ptap->sf));
1435   PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1436   PetscCall(PetscSFSetFromOptions(ptap->sf));
1437   /* One to one map */
1438   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1439   /* Get remote data */
1440   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1441   PetscCall(PetscFree(c_rmtj));
1442   PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
1443   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1444   pcstart *= dof;
1445   pcend *= dof;
1446   for (i = 0; i < pn; i++) {
1447     nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
1448     rdj = c_othj + ptap->c_othi[i];
1449     for (j = 0; j < nzi; j++) {
1450       col = rdj[j];
1451       /* diag part */
1452       if (col >= pcstart && col < pcend) {
1453         PetscCall(PetscHSetIAdd(htd[i], col));
1454       } else { /* off diag */
1455         PetscCall(PetscHSetIAdd(hto[i], col));
1456       }
1457     }
1458     PetscCall(PetscHSetIGetSize(htd[i], &htsize));
1459     dnz[i] = htsize;
1460     PetscCall(PetscHSetIDestroy(&htd[i]));
1461     PetscCall(PetscHSetIGetSize(hto[i], &htsize));
1462     onz[i] = htsize;
1463     PetscCall(PetscHSetIDestroy(&hto[i]));
1464   }
1465 
1466   PetscCall(PetscFree2(htd, hto));
1467   PetscCall(PetscFree(c_othj));
1468 
1469   /* local sizes and preallocation */
1470   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1471   PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
1472   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1473   PetscCall(PetscFree2(dnz, onz));
1474 
1475   /* attach the supporting struct to Cmpi for reuse */
1476   Cmpi->product->data    = ptap;
1477   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1478   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
1479 
1480   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1481   Cmpi->assembled = PETSC_FALSE;
1482   /* pick an algorithm */
1483   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
1484   alg = 0;
1485   PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1486   PetscOptionsEnd();
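  /* Descriptive note: with algTypes = {"merged", "overlapping"}, running with e.g.
     -matptap_allatonce_via overlapping switches the numeric kernel selected below from the merged
     variant (case 0, the default here) to the plain allatonce variant (case 1). */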
1487   switch (alg) {
1488   case 0:
1489     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1490     break;
1491   case 1:
1492     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1493     break;
1494   default:
1495     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported allatonce numerical algorithm");
1496   }
1497   PetscFunctionReturn(PETSC_SUCCESS);
1498 }
1499 
1500 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, PetscReal fill, Mat C)
1501 {
1502   PetscFunctionBegin;
1503   PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, fill, C));
1504   PetscFunctionReturn(PETSC_SUCCESS);
1505 }
1506 
1507 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A, Mat P, PetscReal fill, Mat Cmpi)
1508 {
1509   Mat_APMPI               *ptap;
1510   Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
1511   MPI_Comm                 comm;
1512   PetscMPIInt              size, rank;
1513   PetscFreeSpaceList       free_space = NULL, current_space = NULL;
1514   PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
1515   PetscInt                *lnk, i, k, pnz, row, nsend;
1516   PetscBT                  lnkbt;
1517   PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
1518   PETSC_UNUSED PetscMPIInt icompleted = 0;
1519   PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
1520   PetscInt                 len, proc, *dnz, *onz, *owners, nzi, nspacedouble;
1521   PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
1522   MPI_Request             *swaits, *rwaits;
1523   MPI_Status              *sstatus, rstatus;
1524   PetscLayout              rowmap;
1525   PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
1526   PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
1527   PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, ap_rmax = 0, Crmax, *aj, *ai, *pi;
1528   Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
1529   PetscScalar             *apv;
1530   PetscHMapI               ta;
1531   MatType                  mtype;
1532   const char              *prefix;
1533 #if defined(PETSC_USE_INFO)
1534   PetscReal apfill;
1535 #endif
1536 
1537   PetscFunctionBegin;
1538   MatCheckProduct(Cmpi, 4);
1539   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
1540   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1541   PetscCallMPI(MPI_Comm_size(comm, &size));
1542   PetscCallMPI(MPI_Comm_rank(comm, &rank));
1543 
1544   if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;
1545 
1546   /* create symbolic parallel matrix Cmpi */
1547   PetscCall(MatGetType(A, &mtype));
1548   PetscCall(MatSetType(Cmpi, mtype));
1549 
1550   /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1551   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
1552 
1553   /* create struct Mat_APMPI and attach it to C later */
1554   PetscCall(PetscNew(&ptap));
1555   ptap->reuse   = MAT_INITIAL_MATRIX;
1556   ptap->algType = 1;
1557 
1558   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1559   PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
1560   /* get P_loc by taking all local rows of P */
1561   PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc));
1562 
1563   /* (0) compute Rd = Pd^T, Ro = Po^T  */
1564   PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
1565   PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));
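  /* Descriptive note: R = P^T is kept in two pieces, Rd = Pd^T and Ro = Po^T, so that C = R*(A*P) can
     later be formed as C_loc = Rd*AP_loc and C_oth = Ro*AP_loc (steps 2-1 and 2-2 below), with the
     rows of C_oth shipped to their owning ranks in step 3. */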
1566 
1567   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
1568   p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
1569   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
1570 
1571   /* create and initialize a linked list */
1572   PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* for computing AP_loc and Cmpi */
1573   MatRowMergeMax_SeqAIJ(p_loc, ptap->P_loc->rmap->N, ta);
1574   MatRowMergeMax_SeqAIJ(p_oth, ptap->P_oth->rmap->N, ta);
1575   PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
1576   /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
1577 
1578   PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
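  /* Descriptive note: the condensed linked list lnk (with bit table lnkbt) accumulates, per row of A,
     the sorted union of column indices of the P rows it touches; Crmax, the number of distinct columns
     seen across all local rows of P_loc and P_oth, bounds any single row length and sizes the list. */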
1579 
1580   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1581   if (ao) {
1582     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
1583   } else {
1584     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
1585   }
1586   current_space = free_space;
1587   nspacedouble  = 0;
1588 
1589   PetscCall(PetscMalloc1(am + 1, &api));
1590   api[0] = 0;
1591   for (i = 0; i < am; i++) {
1592     /* diagonal portion: Ad[i,:]*P */
1593     ai  = ad->i;
1594     pi  = p_loc->i;
1595     nzi = ai[i + 1] - ai[i];
1596     aj  = ad->j + ai[i];
1597     for (j = 0; j < nzi; j++) {
1598       row  = aj[j];
1599       pnz  = pi[row + 1] - pi[row];
1600       Jptr = p_loc->j + pi[row];
1601       /* add non-zero cols of P into the sorted linked list lnk */
1602       PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
1603     }
1604     /* off-diagonal portion: Ao[i,:]*P */
1605     if (ao) {
1606       ai  = ao->i;
1607       pi  = p_oth->i;
1608       nzi = ai[i + 1] - ai[i];
1609       aj  = ao->j + ai[i];
1610       for (j = 0; j < nzi; j++) {
1611         row  = aj[j];
1612         pnz  = pi[row + 1] - pi[row];
1613         Jptr = p_oth->j + pi[row];
1614         PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
1615       }
1616     }
1617     apnz       = lnk[0];
1618     api[i + 1] = api[i] + apnz;
1619     if (ap_rmax < apnz) ap_rmax = apnz;
1620 
1621     /* if free space is not available, double the total space in the list */
1622     if (current_space->local_remaining < apnz) {
1623       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
1624       nspacedouble++;
1625     }
1626 
1627     /* Copy data into free space, then initialize lnk */
1628     PetscCall(PetscLLCondensedClean(pN, apnz, current_space->array, lnk, lnkbt));
1629 
1630     current_space->array += apnz;
1631     current_space->local_used += apnz;
1632     current_space->local_remaining -= apnz;
1633   }
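  /* Descriptive note: api is now the CSR row-offset array of the symbolic product AP_loc,
     i.e. row i of AP occupies positions api[i]..api[i+1]-1 of the column array apj gathered below. */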
1634   /* Allocate space for apj and apv, initialize apj, and */
1635   /* destroy list of free space and other temporary array(s) */
1636   PetscCall(PetscMalloc2(api[am], &apj, api[am], &apv));
1637   PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
1638   PetscCall(PetscLLDestroy(lnk, lnkbt));
1639 
1640   /* Create AP_loc for reuse */
1641   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
1642   PetscCall(MatSetType(ptap->AP_loc, ((PetscObject)p->A)->type_name));
1643 #if defined(PETSC_USE_INFO)
1644   if (ao) {
1645     apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
1646   } else {
1647     apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
1648   }
1649   ptap->AP_loc->info.mallocs           = nspacedouble;
1650   ptap->AP_loc->info.fill_ratio_given  = fill;
1651   ptap->AP_loc->info.fill_ratio_needed = apfill;
1652 
1653   if (api[am]) {
1654     PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
1655     PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n", (double)apfill));
1656   } else {
1657     PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc is empty\n"));
1658   }
1659 #endif
1660 
1661   /* (2-1) compute symbolic Co = Ro*AP_loc  */
1662   PetscCall(MatGetOptionsPrefix(A, &prefix));
1663   PetscCall(MatSetOptionsPrefix(ptap->Ro, prefix));
1664   PetscCall(MatAppendOptionsPrefix(ptap->Ro, "inner_offdiag_"));
1665   PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
1666   PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
1667   PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
1668   PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_C_oth_"));
1669   PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
1670   PetscCall(MatProductSetAlgorithm(ptap->C_oth, "default"));
1671   PetscCall(MatProductSetFill(ptap->C_oth, fill));
1672   PetscCall(MatProductSetFromOptions(ptap->C_oth));
1673   PetscCall(MatProductSymbolic(ptap->C_oth));
1674 
1675   /* (3) send coj of C_oth to other processors  */
1676   /* determine row ownership */
1677   PetscCall(PetscLayoutCreate(comm, &rowmap));
1678   rowmap->n  = pn;
1679   rowmap->bs = 1;
1680   PetscCall(PetscLayoutSetUp(rowmap));
1681   owners = rowmap->range;
1682 
1683   /* determine the number of messages to send, their lengths */
1684   PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
1685   PetscCall(PetscArrayzero(len_s, size));
1686   PetscCall(PetscArrayzero(len_si, size));
1687 
1688   c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
1689   coi   = c_oth->i;
1690   coj   = c_oth->j;
1691   con   = ptap->C_oth->rmap->n;
1692   proc  = 0;
1693   for (i = 0; i < con; i++) {
1694     while (prmap[i] >= owners[proc + 1]) proc++;
1695     len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
1696     len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1697   }
1698 
1699   len          = 0; /* max length of buf_si[], see (4) */
1700   owners_co[0] = 0;
1701   nsend        = 0;
1702   for (proc = 0; proc < size; proc++) {
1703     owners_co[proc + 1] = owners_co[proc] + len_si[proc];
1704     if (len_s[proc]) {
1705       nsend++;
1706       len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1707       len += len_si[proc];
1708     }
1709   }
1710 
1711   /* determine the number and length of messages to receive for coi and coj  */
1712   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
1713   PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));
1714 
1715   /* post the Irecv and Isend of coj */
1716   PetscCall(PetscCommGetNewTag(comm, &tagj));
1717   PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
1718   PetscCall(PetscMalloc1(nsend + 1, &swaits));
1719   for (proc = 0, k = 0; proc < size; proc++) {
1720     if (!len_s[proc]) continue;
1721     i = owners_co[proc];
1722     PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
1723     k++;
1724   }
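  /* Descriptive note: the coj messages posted above remain in flight while the symbolic product
     C_loc = Rd*AP_loc (step 2-2) is computed next, overlapping communication with local work;
     the matching MPI_Waitany/MPI_Waitall calls follow that symbolic product. */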
1725 
1726   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1727   PetscCall(MatSetOptionsPrefix(ptap->Rd, prefix));
1728   PetscCall(MatAppendOptionsPrefix(ptap->Rd, "inner_diag_"));
1729   PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
1730   PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
1731   PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
1732   PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_C_loc_"));
1733   PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
1734   PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
1735   PetscCall(MatProductSetFill(ptap->C_loc, fill));
1736   PetscCall(MatProductSetFromOptions(ptap->C_loc));
1737   PetscCall(MatProductSymbolic(ptap->C_loc));
1738 
1739   c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
1740 
1741   /* wait for the receives of coj to complete */
1742   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
1743   PetscCall(PetscFree(rwaits));
1744   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
1745 
1746   /* add received column indices into ta to update Crmax */
1747   for (k = 0; k < nrecv; k++) { /* k-th received message */
1748     Jptr = buf_rj[k];
1749     for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1));
1750   }
1751   PetscCall(PetscHMapIGetSize(ta, &Crmax));
1752   PetscCall(PetscHMapIDestroy(&ta));
1753 
1754   /* (4) send and recv coi */
1755   PetscCall(PetscCommGetNewTag(comm, &tagi));
1756   PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
1757   PetscCall(PetscMalloc1(len + 1, &buf_s));
1758   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1759   for (proc = 0, k = 0; proc < size; proc++) {
1760     if (!len_s[proc]) continue;
1761     /* form outgoing message for i-structure:
1762          buf_si[0]:                 nrows to be sent
1763                [1:nrows]:           row index (global)
1764                [nrows+1:2*nrows+1]: i-structure index
1765     */
1766     nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
1767     buf_si_i    = buf_si + nrows + 1;
1768     buf_si[0]   = nrows;
1769     buf_si_i[0] = 0;
1770     nrows       = 0;
1771     for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
1772       nzi                 = coi[i + 1] - coi[i];
1773       buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
1774       buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
1775       nrows++;
1776     }
1777     PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
1778     k++;
1779     buf_si += len_si[proc];
1780   }
1781   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
1782   PetscCall(PetscFree(rwaits));
1783   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
1784 
1785   PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
1786   PetscCall(PetscFree(len_ri));
1787   PetscCall(PetscFree(swaits));
1788   PetscCall(PetscFree(buf_s));
1789 
1790   /* (5) compute the local portion of Cmpi      */
1791   /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
1792   PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
1793   current_space = free_space;
1794 
1795   PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
1796   for (k = 0; k < nrecv; k++) {
1797     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1798     nrows       = *buf_ri_k[k];
1799     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
1800     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
1801   }
1802 
1803   MatPreallocateBegin(comm, pn, pn, dnz, onz);
1804   PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
1805   for (i = 0; i < pn; i++) {
1806     /* add C_loc into Cmpi */
1807     nzi  = c_loc->i[i + 1] - c_loc->i[i];
1808     Jptr = c_loc->j + c_loc->i[i];
1809     PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
1810 
1811     /* add received col data into lnk */
1812     for (k = 0; k < nrecv; k++) { /* k-th received message */
1813       if (i == *nextrow[k]) {     /* i-th row */
1814         nzi  = *(nextci[k] + 1) - *nextci[k];
1815         Jptr = buf_rj[k] + *nextci[k];
1816         PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
1817         nextrow[k]++;
1818         nextci[k]++;
1819       }
1820     }
1821     nzi = lnk[0];
1822 
1823     /* copy data into free space, then initialize lnk */
1824     PetscCall(PetscLLCondensedClean(pN, nzi, current_space->array, lnk, lnkbt));
1825     PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
1826   }
1827   PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
1828   PetscCall(PetscLLDestroy(lnk, lnkbt));
1829   PetscCall(PetscFreeSpaceDestroy(free_space));
1830 
1831   /* local sizes and preallocation */
1832   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1833   if (P->cmap->bs > 0) {
1834     PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
1835     PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
1836   }
1837   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1838   MatPreallocateEnd(dnz, onz);
1839 
1840   /* free communication buffers */
1841   PetscCall(PetscFree(id_r));
1842   PetscCall(PetscFree(len_r));
1843   PetscCall(PetscFree(buf_ri[0]));
1844   PetscCall(PetscFree(buf_ri));
1845   PetscCall(PetscFree(buf_rj[0]));
1846   PetscCall(PetscFree(buf_rj));
1847   PetscCall(PetscLayoutDestroy(&rowmap));
1848 
1849   PetscCall(PetscCalloc1(pN, &ptap->apa));
1850 
1851   /* attach the supporting struct to Cmpi for reuse */
1852   Cmpi->product->data    = ptap;
1853   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1854   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
1855 
1856   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1857   Cmpi->assembled = PETSC_FALSE;
1858   PetscFunctionReturn(PETSC_SUCCESS);
1859 }
1860 
1861 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A, Mat P, Mat C)
1862 {
1863   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
1864   Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
1865   Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
1866   Mat_APMPI         *ptap;
1867   Mat                AP_loc, C_loc, C_oth;
1868   PetscInt           i, rstart, rend, cm, ncols, row;
1869   PetscInt          *api, *apj, am = A->rmap->n, j, col, apnz;
1870   PetscScalar       *apa;
1871   const PetscInt    *cols;
1872   const PetscScalar *vals;
1873 
1874   PetscFunctionBegin;
1875   MatCheckProduct(C, 3);
1876   ptap = (Mat_APMPI *)C->product->data;
1877   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
1878   PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
1879 
1880   PetscCall(MatZeroEntries(C));
1881   /* 1) get Rd = Pd^T, Ro = Po^T */
1882   if (ptap->reuse == MAT_REUSE_MATRIX) {
1883     PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
1884     PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
1885   }
1886 
1887   /* 2) get AP_loc */
1888   AP_loc = ptap->AP_loc;
1889   ap     = (Mat_SeqAIJ *)AP_loc->data;
1890 
1891   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
1892   if (ptap->reuse == MAT_REUSE_MATRIX) {
1893     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
1894     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
1895     PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
1896   }
1897 
1898   /* 2-2) compute numeric A_loc*P - dominating part */
1899   /* get data from symbolic products */
1900   p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
1901   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
1902   apa = ptap->apa;
1903   api = ap->i;
1904   apj = ap->j;
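  /* Descriptive note: apa is a dense accumulator of length pN allocated in the symbolic phase, which is
     why this kernel is labeled nonscalable; each row of AP is accumulated densely by AProw_nonscalable()
     and then compressed into ap->a through the column indices apj, zeroing apa again as it goes. */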
1905   for (i = 0; i < am; i++) {
1906     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
1907     AProw_nonscalable(i, ad, ao, p_loc, p_oth, apa);
1908     apnz = api[i + 1] - api[i];
1909     for (j = 0; j < apnz; j++) {
1910       col                 = apj[j + api[i]];
1911       ap->a[j + ap->i[i]] = apa[col];
1912       apa[col]            = 0.0;
1913     }
1914   }
1915   /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
1916   PetscCall(PetscObjectStateIncrease((PetscObject)AP_loc));
1917 
1918   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
1919   PetscCall(MatProductNumeric(ptap->C_loc));
1920   PetscCall(MatProductNumeric(ptap->C_oth));
1921   C_loc = ptap->C_loc;
1922   C_oth = ptap->C_oth;
1923 
1924   /* add C_loc and Co to C */
1925   PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
1926 
1927   /* C_loc -> C */
1928   cm    = C_loc->rmap->N;
1929   c_seq = (Mat_SeqAIJ *)C_loc->data;
1930   cols  = c_seq->j;
1931   vals  = c_seq->a;
1932 
1933   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE
1934      and there are no off-processor parts.
1935      If was_assembled is true, the assignment aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat
1936      no longer holds, and the more general MatSetValues_MPIAIJ() must be used instead, where the column index is looked
1937      up from a table and additional bookkeeping is required. */
1938   if (C->assembled) {
1939     C->was_assembled = PETSC_TRUE;
1940     C->assembled     = PETSC_FALSE;
1941   }
1942   if (C->was_assembled) {
1943     for (i = 0; i < cm; i++) {
1944       ncols = c_seq->i[i + 1] - c_seq->i[i];
1945       row   = rstart + i;
1946       PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
1947       cols += ncols;
1948       vals += ncols;
1949     }
1950   } else {
1951     PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
1952   }
1953 
1954   /* Co -> C, off-processor part */
1955   cm    = C_oth->rmap->N;
1956   c_seq = (Mat_SeqAIJ *)C_oth->data;
1957   cols  = c_seq->j;
1958   vals  = c_seq->a;
1959   for (i = 0; i < cm; i++) {
1960     ncols = c_seq->i[i + 1] - c_seq->i[i];
1961     row   = p->garray[i];
1962     PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
1963     cols += ncols;
1964     vals += ncols;
1965   }
1966 
1967   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1968   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
1969 
1970   ptap->reuse = MAT_REUSE_MATRIX;
1971   PetscFunctionReturn(PETSC_SUCCESS);
1972 }
1973 
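/* Caller-side usage sketch (illustrative only, assuming the standard public MatPtAP()/MatProduct API;
   the names A, P, C and the fill value 2.0 are placeholders, not taken from this file):

     Mat C;
     PetscCall(MatPtAP(A, P, MAT_INITIAL_MATRIX, 2.0, &C)); // symbolic + numeric; attaches Mat_APMPI to C->product
     // ... update the numerical values of A, keeping its nonzero pattern ...
     PetscCall(MatPtAP(A, P, MAT_REUSE_MATRIX, 2.0, &C));   // reuses the symbolic data via the ptapnumeric kernels above

   The dispatcher below selects an implementation from the product->alg string ("scalable", "nonscalable",
   "allatonce", "allatonce_merged", "backend", and "hypre" when available), which a caller can also set
   explicitly with MatProductSetAlgorithm() when driving the product through MatProductCreate(). */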
1974 PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
1975 {
1976   Mat_Product        *product = C->product;
1977   Mat                 A = product->A, P = product->B;
1978   MatProductAlgorithm alg  = product->alg;
1979   PetscReal           fill = product->fill;
1980   PetscBool           flg;
1981 
1982   PetscFunctionBegin;
1983   /* scalable: do R=P^T locally, then C=R*A*P */
1984   PetscCall(PetscStrcmp(alg, "scalable", &flg));
1985   if (flg) {
1986     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A, P, product->fill, C));
1987     C->ops->productnumeric = MatProductNumeric_PtAP;
1988     goto next;
1989   }
1990 
1991   /* nonscalable: do R=P^T locally, then C=R*A*P */
1992   PetscCall(PetscStrcmp(alg, "nonscalable", &flg));
1993   if (flg) {
1994     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ(A, P, fill, C));
1995     goto next;
1996   }
1997 
1998   /* allatonce */
1999   PetscCall(PetscStrcmp(alg, "allatonce", &flg));
2000   if (flg) {
2001     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A, P, fill, C));
2002     goto next;
2003   }
2004 
2005   /* allatonce_merged */
2006   PetscCall(PetscStrcmp(alg, "allatonce_merged", &flg));
2007   if (flg) {
2008     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A, P, fill, C));
2009     goto next;
2010   }
2011 
2012   /* backend general code */
2013   PetscCall(PetscStrcmp(alg, "backend", &flg));
2014   if (flg) {
2015     PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
2016     PetscFunctionReturn(PETSC_SUCCESS);
2017   }
2018 
2019   /* hypre */
2020 #if defined(PETSC_HAVE_HYPRE)
2021   PetscCall(PetscStrcmp(alg, "hypre", &flg));
2022   if (flg) {
2023     PetscCall(MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A, P, fill, C));
2024     C->ops->productnumeric = MatProductNumeric_PtAP;
2025     PetscFunctionReturn(PETSC_SUCCESS);
2026   }
2027 #endif
2028   SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
2029 
2030 next:
2031   C->ops->productnumeric = MatProductNumeric_PtAP;
2032   PetscFunctionReturn(PETSC_SUCCESS);
2033 }
2034