xref: /petsc/src/dm/partitioner/impls/multistage/mspart.c (revision b3881d2ae8531d3d060b1b4136d5482ffefdf2cc)
1 #include <petscsf.h>
2 #include <petscdmplex.h>
3 #include <petsc/private/dmimpl.h>
4 #include <petsc/private/dmpleximpl.h>
5 #include <petsc/private/partitionerimpl.h> /*I "petscpartitioner.h" I*/
6 
7 PetscBool  MSPartitionerCite       = PETSC_FALSE;
8 const char MSPartitionerCitation[] = "@article{PETScMSPart2021,\n"
9                                      "  author  = {Parsani, Matteo and Boukharfane, Radouan and Nolasco, Irving Reyna and Fern{\'a}ndez, David C Del Rey and Zampini, Stefano and Hadri, Bilel and Dalcin, Lisandro},\n"
10                                      "  title   = {High-order accurate entropy-stable discontinuous collocated Galerkin methods with the summation-by-parts property for compressible {CFD} frameworks: Scalable {SSDC} algorithms and flow solver},\n"
11                                      "  journal = {Journal of Computational Physics},\n"
12                                      "  volume  = {424},\n"
13                                      "  pages   = {109844},\n"
14                                      "  year    = {2021}\n"
15                                      "}\n";
16 
17 PetscLogEvent PetscPartitioner_MS_SetUp;
18 PetscLogEvent PetscPartitioner_MS_Stage[PETSCPARTITIONER_MS_NUMSTAGE];
19 
/* Implementation context for the multistage partitioner (PETSCPARTITIONERMULTISTAGE) */
typedef struct {
  PetscInt   levels; /* number of partitioning stages */
  MPI_Group *lgroup; /* per-stage MPI groups, expressed in the rank numbering of the partitioner's communicator */
  /* Need access to the DM in inner stages */
  PetscInt stage;   /* stage currently being partitioned */
  DM       stagedm; /* DM associated with the current stage */
  /* Stage partitioners */
  PetscPartitioner *ppart; /* per-stage partitioners; NULL on ranks not participating in a given stage */
  /* Diagnostic */
  PetscInt *edgeCut; /* per-stage edge cut, one entry per level */
  /* Target partition weights */
  PetscBool     view_tpwgs;    /* whether to view the target partition weight sections */
  PetscSection *tpwgs;         /* per-stage target partition weights */
  PetscBool     compute_tpwgs; /* whether to compute target partition weights during setup */
} PetscPartitioner_MS;
35 
36 const char *const PetscPartitionerMultistageStrategyList[] = {"NODE", "MSECTION", "PetscPartitionerMultistageStrategy", "PETSCPARTITIONER_MS_", NULL};
37 
/* User-defined MPI reduction kernel: combines the two buffers element-wise via
   the least common multiple. Only MPIU_INT data is supported; any other
   datatype aborts, since an MPI_User_function cannot return an error code. */
static void PetscLCM_Local(void *in, void *out, PetscMPIInt *cnt, MPI_Datatype *datatype)
{
  const PetscInt  n    = *cnt;
  const PetscInt *src  = (const PetscInt *)in;
  PetscInt       *dest = (PetscInt *)out;

  PetscFunctionBegin;
  if (*datatype != MPIU_INT) {
    (void)(*PetscErrorPrintf)("Can only handle MPIU_INT");
    PETSCABORT(MPI_COMM_SELF, PETSC_ERR_ARG_WRONG);
  }
  for (PetscInt i = 0; i < n; i++) dest[i] = PetscLCM(src[i], dest[i]);
  PetscFunctionReturnVoid();
}
51 
/* ASCII viewer for the multistage partitioner.

   Prints the total number of stages and, for each stage, a one-line summary of
   the stage layout. With the PETSC_VIEWER_ASCII_INFO_DETAIL format it also
   views each stage partitioner on its own subcommunicator, lists the global
   ranks involved in the stage, and optionally views the target partition
   weights. When a stage DM is set (inner-stage invocation), output is
   restricted to the current stage only. */
static PetscErrorCode PetscPartitionerView_Multistage(PetscPartitioner part, PetscViewer viewer)
{
  PetscPartitioner_MS *p = (PetscPartitioner_MS *)part->data;
  PetscViewerFormat    format;
  PetscBool            iascii;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
  PetscCall(PetscViewerGetFormat(viewer, &format));
  if (iascii) {
    MPI_Comm  comm;
    MPI_Group group;

    PetscCall(PetscViewerASCIIPushTab(viewer));
    /* header only on the first stage (or when no stage DM restricts the output) */
    if (!p->stagedm || p->stage == 0) PetscCall(PetscViewerASCIIPrintf(viewer, "Multistage graph partitioner: total stages %" PetscInt_FMT "\n", p->levels));
    comm = PetscObjectComm((PetscObject)part);
    PetscCallMPI(MPI_Comm_group(comm, &group));
    for (PetscInt l = 0; l < p->levels; l++) {
      PetscPartitioner ppart  = p->ppart[l];
      MPI_Comm         pcomm  = PetscObjectComm((PetscObject)ppart);
      MPI_Group        pgroup = MPI_GROUP_EMPTY;

      PetscCall(PetscViewerASCIIPushTab(viewer));
      /* stage 0 reuses the stored group; inner stages derive the group from the
         stage partitioner's communicator (may be empty on non-participating ranks) */
      if (l) {
        if (pcomm != MPI_COMM_NULL) PetscCallMPI(MPI_Comm_group(pcomm, &pgroup));
      } else {
        pgroup = p->lgroup[0];
      }

      if (l) {
        IS          is;
        PetscMPIInt gr, gem = 1;
        PetscInt    uniq;

        /* count unique stage groups: translate rank 0 of each local group to
           the global numbering and renumber; gem counts empty processes */
        PetscCallMPI(MPI_Group_size(group, &gr));
        if (pgroup != MPI_GROUP_EMPTY) {
          gem = 0;
          PetscCallMPI(MPI_Group_translate_ranks(pgroup, 1, &gem, group, &gr));
        }
        PetscCall(ISCreateStride(PetscObjectComm((PetscObject)part), 1, gr, 1, &is));
        PetscCall(ISRenumber(is, NULL, &uniq, NULL));
        PetscCall(ISDestroy(&is));
        PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &gem, 1, MPI_INT, MPI_SUM, PetscObjectComm((PetscObject)part)));
        /* empty processes all map to the same group id; do not count it as a real group */
        if (gem) uniq--;
        if (!p->stagedm || l == p->stage) PetscCall(PetscViewerASCIIPrintf(viewer, "Stage %" PetscInt_FMT " partitioners (%" PetscInt_FMT " unique groups, %d empty processes)\n", l, uniq, gem));
      } else {
        PetscMPIInt psize;
        PetscCallMPI(MPI_Group_size(pgroup, &psize));
        if (!p->stagedm || l == p->stage) PetscCall(PetscViewerASCIIPrintf(viewer, "Stage %" PetscInt_FMT " partitioner to %d processes\n", l, psize));
      }
      PetscCall(PetscViewerFlush(viewer));
      if (format == PETSC_VIEWER_ASCII_INFO_DETAIL && (!p->stagedm || l == p->stage)) {
        PetscViewer pviewer;

        if (pcomm == MPI_COMM_NULL) pcomm = PETSC_COMM_SELF;
        /* subviewer restricted to the stage communicator */
        PetscCall(PetscViewerGetSubViewer(viewer, pcomm, &pviewer));
        if (ppart) {
          PetscMPIInt size, *ranks, *granks;
          char        tstr[16], strranks[3072]; /* I'm lazy: max 12 chars (including spaces) per rank -> 256 ranks max*/

          /* translate the stage group's ranks to the global numbering for display */
          PetscCallMPI(MPI_Group_size(pgroup, &size));
          PetscCall(PetscMalloc2(size, &ranks, size, &granks));
          for (PetscMPIInt i = 0; i < size; i++) ranks[i] = i;
          PetscCallMPI(MPI_Group_translate_ranks(pgroup, size, ranks, group, granks));
          if (size <= 256) {
            PetscCall(PetscStrncpy(strranks, "", sizeof(strranks)));
            for (PetscInt i = 0; i < size; i++) {
              PetscCall(PetscSNPrintf(tstr, sizeof(tstr), " %d", granks[i]));
              PetscCall(PetscStrlcat(strranks, tstr, sizeof(strranks)));
            }
          } else PetscCall(PetscStrncpy(strranks, " not showing > 256", sizeof(strranks))); /* LCOV_EXCL_LINE */
          PetscCall(PetscFree2(ranks, granks));
          if (!l) {
            PetscCall(PetscViewerASCIIPrintf(pviewer, "Destination ranks:%s\n", strranks));
            PetscCall(PetscPartitionerView(ppart, pviewer));
          } else {
            PetscCall(PetscPartitionerView(ppart, pviewer));
            PetscCall(PetscViewerASCIIPrintf(pviewer, "  Participating ranks:%s\n", strranks));
          }
          if (p->view_tpwgs) PetscCall(PetscSectionView(p->tpwgs[l], pviewer));
        }
        PetscCall(PetscViewerRestoreSubViewer(viewer, pcomm, &pviewer));
      }
      PetscCall(PetscViewerFlush(viewer));

      /* stage-0 group is owned by the context; only free groups created in this loop */
      if (l && pgroup != MPI_GROUP_EMPTY) PetscCallMPI(MPI_Group_free(&pgroup));
    }
    for (PetscInt l = 0; l < p->levels; l++) PetscCall(PetscViewerASCIIPopTab(viewer));
    PetscCall(PetscViewerASCIIPopTab(viewer));
    PetscCallMPI(MPI_Group_free(&group));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
145 
/* Build the per-stage MPI groups for the multistage partitioner.

   Input:
     comm    - the partitioner communicator
     options - NULL for defaults, otherwise options[0] is the strategy
               (PETSCPARTITIONER_MS_STRATEGY_NODE or _MSECTION); for NODE,
               options[1] is the node size (<=0 selects shared-memory splitting
               when available) and options[2] selects interleaved coloring; for
               MSECTION, options[1] is the section factor m (>= 2).
   Output:
     nlevels - number of stages
     levels  - malloc'ed array of MPI_Group, one per stage, expressed in the
               rank numbering of comm; caller destroys it with
               PetscPartitionerMultistage_DestroyStages(). */
static PetscErrorCode PetscPartitionerMultistage_CreateStages(MPI_Comm comm, const PetscInt *options, PetscInt *nlevels, MPI_Group *levels[])
{
  MPI_Comm     ncomm;
  MPI_Group   *lgroup;
  MPI_Group    ggroup, ngroup;
  PetscMPIInt *ranks, *granks;
  PetscMPIInt  size, nsize, isize, rank, nrank, i, l, n, m;
  PetscInt     strategy = options ? options[0] : (PetscInt)PETSCPARTITIONER_MS_STRATEGY_NODE;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));

  /* trivial case: a single process gives a single stage containing everyone */
  if (size == 1) {
    PetscCall(PetscMalloc1(1, &lgroup));
    PetscCallMPI(MPI_Comm_group(comm, &lgroup[0]));
    *nlevels = 1;
    *levels  = lgroup;
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  switch (strategy) {
  case PETSCPARTITIONER_MS_STRATEGY_NODE:
    /* create groups (in global rank ordering of comm) for the 2-level partitioner */
    if (options && options[1] > 0) {
      PetscMPIInt node_size = (PetscMPIInt)options[1];
      if (node_size > 1) {
        PetscMPIInt color;
        if (options[2]) { /* interleaved */
          PetscMPIInt ngroups = size / node_size;

          if (size % node_size) ngroups += 1;
          color = rank % ngroups;
        } else {
          color = rank / node_size;
        }
        PetscCallMPI(MPI_Comm_split(comm, color, rank, &ncomm));
      } else {
        /* node size 1: every process is its own "node" level; use full comm */
        PetscCallMPI(MPI_Comm_dup(comm, &ncomm));
      }
    } else {
#if defined(PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY)
      PetscCallMPI(MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &ncomm));
#else
      /* if users do not specify the node size and MPI_Comm_split_type is not available, defaults to same comm */
      PetscCallMPI(MPI_Comm_dup(comm, &ncomm));
#endif
    }

    PetscCallMPI(MPI_Comm_size(ncomm, &nsize));
    if (nsize == size) { /* one node */
      PetscCall(PetscMalloc1(1, &lgroup));
      PetscCallMPI(MPI_Comm_group(ncomm, &lgroup[0]));
      PetscCallMPI(MPI_Comm_free(&ncomm));

      *nlevels = 1;
      *levels  = lgroup;
      break;
    }

    PetscCall(PetscMalloc1(2, &lgroup));

    /* intranode group (in terms of global rank) */
    PetscCall(PetscMalloc2(size, &ranks, nsize, &granks));
    for (i = 0; i < nsize; i++) ranks[i] = i;
    PetscCallMPI(MPI_Comm_group(comm, &ggroup));
    PetscCallMPI(MPI_Comm_group(ncomm, &ngroup));
    PetscCallMPI(MPI_Group_translate_ranks(ngroup, nsize, ranks, ggroup, granks));
    PetscCallMPI(MPI_Group_incl(ggroup, nsize, granks, &lgroup[1]));

    /* internode group (master processes on the nodes only)
       this group must be specified by all processes in the comm
       we need to gather those master ranks */
    PetscCallMPI(MPI_Group_rank(ngroup, &nrank));
    granks[0] = !nrank ? rank : -1; /* only node masters contribute their global rank */
    PetscCallMPI(MPI_Allgather(granks, 1, MPI_INT, ranks, 1, MPI_INT, comm));
    /* compact the gathered list, dropping the -1 entries from non-masters */
    for (i = 0, isize = 0; i < size; i++)
      if (ranks[i] >= 0) ranks[isize++] = ranks[i];
    PetscCallMPI(MPI_Group_incl(ggroup, isize, ranks, &lgroup[0]));

    PetscCall(PetscFree2(ranks, granks));
    PetscCallMPI(MPI_Group_free(&ggroup));
    PetscCallMPI(MPI_Group_free(&ngroup));
    PetscCallMPI(MPI_Comm_free(&ncomm));

    *nlevels = 2;
    *levels  = lgroup;
    break;

  case PETSCPARTITIONER_MS_STRATEGY_MSECTION:
    /* recursive m-section (m=2 bisection) */
    m = options ? (PetscMPIInt)options[1] : 2;
    PetscCheck(m >= 2, comm, PETSC_ERR_SUP, "Invalid split %d, must be greater than one", m);
    /* find l, n such that n = m^l is the largest power of m not exceeding size */
    l = 0;
    n = 1;
    while (n <= size) {
      l++;
      n *= m;
    }
    l -= 1;
    n /= m;

    PetscCheck(l != 0, comm, PETSC_ERR_SUP, "Invalid split %d with communicator size %d", m, size);
    PetscCall(PetscMalloc1(l, &lgroup));
    /* inner levels default to empty; filled below only on participating ranks */
    for (i = 1; i < l; i++) lgroup[i] = MPI_GROUP_EMPTY;

    if (l > 1) {
      PetscMPIInt rem = size - n; /* ranks beyond the largest power of m */
      PetscCall(PetscMalloc1((m + rem), &ranks));
      /* stage 0: m evenly spaced anchors plus all leftover ranks */
      for (i = 0; i < m; i++) ranks[i] = i * (n / m);
      for (i = 0; i < rem; i++) ranks[i + m] = n + i;
      PetscCallMPI(MPI_Comm_group(comm, &ggroup));
      PetscCallMPI(MPI_Group_incl(ggroup, m + rem, ranks, &lgroup[0]));
      if (rank < n) {
        for (i = 1; i < l; i++) {
          /* stride between group members shrinks by a factor m at each level */
          PetscMPIInt inc = (PetscMPIInt)PetscPowInt(m, l - i - 1);
          if (rank % inc == 0) {
            PetscMPIInt anch = (rank - rank % (inc * m)), r;
            for (r = 0; r < m; r++) ranks[r] = anch + r * inc;
            PetscCallMPI(MPI_Group_incl(ggroup, m, ranks, &lgroup[i]));
          }
        }
      }
      PetscCall(PetscFree(ranks));
      PetscCallMPI(MPI_Group_free(&ggroup));
    } else {
      PetscCallMPI(MPI_Comm_group(comm, &lgroup[0]));
    }

    *nlevels = l;
    *levels  = lgroup;
    break;

  default:
    SETERRQ(comm, PETSC_ERR_SUP, "Not implemented"); /* LCOV_EXCL_LINE */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
284 
PetscPartitionerMultistage_DestroyStages(PetscInt nstages,MPI_Group * groups[])285 static PetscErrorCode PetscPartitionerMultistage_DestroyStages(PetscInt nstages, MPI_Group *groups[])
286 {
287   PetscFunctionBegin;
288   for (PetscInt l = 0; l < nstages; l++) {
289     MPI_Group group = (*groups)[l];
290     if (group != MPI_GROUP_EMPTY) PetscCallMPI(MPI_Group_free(&group));
291   }
292   PetscCall(PetscFree(*groups));
293   PetscFunctionReturn(PETSC_SUCCESS);
294 }
295 
PetscPartitionerReset_Multistage(PetscPartitioner part)296 static PetscErrorCode PetscPartitionerReset_Multistage(PetscPartitioner part)
297 {
298   PetscPartitioner_MS *p       = (PetscPartitioner_MS *)part->data;
299   PetscInt             nstages = p->levels;
300 
301   PetscFunctionBegin;
302   p->levels = 0;
303   PetscCall(PetscPartitionerMultistage_DestroyStages(nstages, &p->lgroup));
304   for (PetscInt l = 0; l < nstages; l++) {
305     PetscCall(PetscPartitionerDestroy(&p->ppart[l]));
306     PetscCall(PetscSectionDestroy(&p->tpwgs[l]));
307   }
308   PetscCall(PetscFree(p->ppart));
309   PetscCall(PetscFree(p->tpwgs));
310   PetscCall(PetscFree(p->edgeCut));
311   PetscFunctionReturn(PETSC_SUCCESS);
312 }
313 
/* Destroy hook: release all stage data, then free the implementation context */
static PetscErrorCode PetscPartitionerDestroy_Multistage(PetscPartitioner part)
{
  PetscFunctionBegin;
  PetscCall(PetscPartitionerReset_Multistage(part));
  PetscCall(PetscFree(part->data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
321 
322 /*@
323   PetscPartitionerMultistageSetStages - Sets stages for the partitioning
324 
325   Collective
326 
327   Input Parameters:
328 + part   - the `PetscPartitioner` object.
329 . levels - the number of stages.
330 - lgroup - array of `MPI_Group`s for each stage.
331 
332   Level: advanced
333 
334   Notes:
335   `MPI_Comm_create(comm, lgroup[l], &lcomm)` is used to compute the communicator for the stage partitioner at level `l`.
336   The groups must be specified in the process numbering of the partitioner communicator.
337   `lgroup[0]` must be collectively specified and it must represent a proper subset of the communicator associated with the original partitioner.
338   For each level, ranks can be listed in one group only (but they can be listed on different levels)
339 
340 .seealso: `PetscPartitionerSetType()`, `PetscPartitionerDestroy()`, `PETSCPARTITIONERMULTISTAGE`
341 @*/
PetscPartitionerMultistageSetStages(PetscPartitioner part,PetscInt levels,MPI_Group lgroup[])342 PetscErrorCode PetscPartitionerMultistageSetStages(PetscPartitioner part, PetscInt levels, MPI_Group lgroup[])
343 {
344   PetscPartitioner_MS *p = (PetscPartitioner_MS *)part->data;
345   MPI_Comm             comm;
346   MPI_Group            wgroup;
347   PetscMPIInt        **lparts = NULL;
348   PetscMPIInt          rank, size;
349   PetscBool            touched, isms;
350 
351   PetscFunctionBegin;
352   PetscValidHeaderSpecific(part, PETSCPARTITIONER_CLASSID, 1);
353   PetscValidLogicalCollectiveInt(part, levels, 2);
354   PetscCheck(levels >= 0, PetscObjectComm((PetscObject)part), PETSC_ERR_ARG_OUTOFRANGE, "Number of levels must be non-negative");
355   if (levels) PetscAssertPointer(lgroup, 3);
356   PetscCall(PetscObjectTypeCompare((PetscObject)part, PETSCPARTITIONERMULTISTAGE, &isms));
357   if (!isms) PetscFunctionReturn(PETSC_SUCCESS);
358   PetscCall(PetscLogEventBegin(PetscPartitioner_MS_SetUp, part, 0, 0, 0));
359 
360   PetscCall(PetscPartitionerReset_Multistage(part));
361 
362   PetscCall(PetscObjectGetComm((PetscObject)part, &comm));
363   PetscCallMPI(MPI_Comm_group(comm, &wgroup));
364   PetscCallMPI(MPI_Comm_rank(comm, &rank));
365   PetscCallMPI(MPI_Comm_size(comm, &size));
366 
367   p->levels = levels;
368   PetscCall(PetscCalloc1(p->levels, &p->ppart));
369   PetscCall(PetscCalloc1(p->levels, &p->lgroup));
370   PetscCall(PetscCalloc1(p->levels, &p->tpwgs));
371   PetscCall(PetscCalloc1(p->levels, &p->edgeCut));
372 
373   /* support for target partition weights */
374   touched = PETSC_FALSE;
375   if (p->compute_tpwgs) PetscCall(PetscMalloc1(p->levels, &lparts));
376 
377   for (PetscInt l = 0; l < p->levels; l++) {
378     const char *prefix;
379     char        aprefix[256];
380     MPI_Comm    lcomm;
381 
382     if (l) { /* let MPI complain/hang if the user did not specify the groups properly */
383       PetscCallMPI(MPI_Comm_create(comm, lgroup[l], &lcomm));
384     } else { /* in debug mode, we check that the initial group must be consistently (collectively) specified on comm */
385 #if defined(PETSC_USE_DEBUG)
386       MPI_Group    group, igroup = lgroup[0];
387       PetscMPIInt *ranks, *granks;
388       PetscMPIInt  b[2], b2[2], csize, gsize;
389 
390       PetscCallMPI(MPI_Group_size(igroup, &gsize));
391       b[0] = -gsize;
392       b[1] = +gsize;
393       PetscCallMPI(MPIU_Allreduce(b, b2, 2, MPI_INT, MPI_MAX, comm));
394       PetscCheck(-b2[0] == b2[1], comm, PETSC_ERR_ARG_WRONG, "Initial group must be collectively specified");
395       PetscCallMPI(MPI_Comm_group(comm, &group));
396       PetscCallMPI(MPI_Group_size(group, &csize));
397       PetscCall(PetscMalloc2(gsize, &ranks, (csize * gsize), &granks));
398       for (PetscMPIInt i = 0; i < gsize; i++) granks[i] = i;
399       PetscCallMPI(MPI_Group_translate_ranks(igroup, gsize, granks, group, ranks));
400       PetscCallMPI(MPI_Group_free(&group));
401       PetscCallMPI(MPI_Allgather(ranks, gsize, MPI_INT, granks, gsize, MPI_INT, comm));
402       for (PetscMPIInt i = 0; i < gsize; i++) {
403         PetscMPIInt chkr = granks[i];
404         for (PetscMPIInt j = 0; j < csize; j++) {
405           PetscMPIInt shft = j * gsize + i;
406           PetscCheck(chkr == granks[shft], comm, PETSC_ERR_ARG_WRONG, "Initial group must be collectively specified");
407         }
408       }
409       PetscCall(PetscFree2(ranks, granks));
410 #endif
411       lcomm = comm;
412     }
413     PetscCall(PetscObjectGetOptionsPrefix((PetscObject)part, &prefix));
414     if (lcomm != MPI_COMM_NULL) {
415       /* MPI_Group_dup */
416       PetscCallMPI(MPI_Group_union(lgroup[l], MPI_GROUP_EMPTY, &p->lgroup[l]));
417       PetscCall(PetscPartitionerCreate(lcomm, &p->ppart[l]));
418       PetscCall(PetscObjectSetOptionsPrefix((PetscObject)p->ppart[l], prefix));
419       PetscCall(PetscSNPrintf(aprefix, sizeof(aprefix), "petscpartitioner_multistage_levels_%" PetscInt_FMT "_", l));
420       PetscCall(PetscObjectAppendOptionsPrefix((PetscObject)p->ppart[l], aprefix));
421     } else {
422       PetscCheck(l != 0, PetscObjectComm((PetscObject)part), PETSC_ERR_USER, "Invalid first group");
423       p->lgroup[l] = MPI_GROUP_EMPTY;
424     }
425     if (lcomm != comm && lcomm != MPI_COMM_NULL) PetscCallMPI(MPI_Comm_free(&lcomm));
426 
427     /* compute number of partitions per level and detect if a process is part of the process (at any level) */
428     if (p->compute_tpwgs) {
429       PetscMPIInt gsize;
430       PetscCall(PetscMalloc1(size, &lparts[l]));
431       PetscCallMPI(MPI_Group_size(p->lgroup[l], &gsize));
432       if (!l) {
433         PetscMPIInt tr;
434         PetscCallMPI(MPI_Group_translate_ranks(wgroup, 1, &rank, p->lgroup[0], &tr));
435         if (tr == MPI_UNDEFINED) gsize = 0;
436       }
437       if (touched && !gsize) gsize = 1;
438       PetscCallMPI(MPI_Allgather(&gsize, 1, MPI_INT, lparts[l], 1, MPI_INT, comm));
439       if (lparts[l][rank]) touched = PETSC_TRUE;
440     }
441   }
442 
443   /* determine weights (bottom-up) */
444   if (p->compute_tpwgs) {
445     PetscMPIInt *tranks;
446     PetscInt    *lwgts, wgt;
447     MPI_Op       MPIU_LCM; /* XXX this is actually recreated at every setup */
448 
449     PetscCall(PetscMalloc1((2 * size), &tranks));
450 
451     /* we need to compute the least common multiple across processes */
452     PetscCallMPI(MPI_Op_create(PetscLCM_Local, 1, &MPIU_LCM));
453 
454     /* final target has to have all ones as weights (if the process gets touched) */
455     wgt = touched ? 1 : 0;
456     PetscCall(PetscMalloc1(size, &lwgts));
457     PetscCallMPI(MPI_Allgather(&wgt, 1, MPIU_INT, lwgts, 1, MPIU_INT, comm));
458 
459     /* now go up the hierarchy and populate the PetscSection describing the partition weights */
460     for (PetscInt l = p->levels - 1; l >= 0; l--) {
461       MPI_Comm    pcomm;
462       MPI_Group   igroup;
463       PetscMPIInt isize, isizer = 0;
464       PetscInt    a, b, wgtr;
465       PetscMPIInt gsize;
466       PetscBool   usep = PETSC_FALSE;
467 
468       if (p->ppart[l]) {
469         PetscCall(PetscObjectGetComm((PetscObject)p->ppart[l], &pcomm));
470       } else {
471         pcomm = PETSC_COMM_SELF;
472       }
473 
474       PetscCallMPI(MPI_Group_size(p->lgroup[l], &gsize));
475       if (gsize) {
476         usep = PETSC_TRUE;
477         for (PetscMPIInt i = 0; i < gsize; i++) tranks[i] = i;
478         PetscCallMPI(MPI_Group_translate_ranks(p->lgroup[l], gsize, tranks, wgroup, tranks + size));
479       } else gsize = lparts[l][rank];
480       PetscCall(PetscFree(lparts[l]));
481       PetscCall(PetscSectionCreate(pcomm, &p->tpwgs[l]));
482       PetscCall(PetscObjectSetName((PetscObject)p->tpwgs[l], "Target partition weights"));
483       PetscCall(PetscSectionSetChart(p->tpwgs[l], 0, gsize));
484 
485       if (usep) {
486         PetscMPIInt *tr = tranks + size;
487         for (PetscMPIInt i = 0; i < gsize; i++) PetscCall(PetscSectionSetDof(p->tpwgs[l], i, lwgts[tr[i]]));
488       } else if (gsize) {
489         PetscCall(PetscSectionSetDof(p->tpwgs[l], 0, 1));
490       }
491       PetscCall(PetscSectionSetUp(p->tpwgs[l]));
492       if (!l) break;
493 
494       /* determine number of processes shared by two consecutive levels */
495       PetscCallMPI(MPI_Group_intersection(p->lgroup[l], p->lgroup[l - 1], &igroup));
496       PetscCallMPI(MPI_Group_size(igroup, &isize));
497       PetscCallMPI(MPI_Group_free(&igroup));
498       if (!p->ppart[l] && touched) isize = 1; /* if this process gets touched, needs to propagate its data from one level to the other */
499 
500       /* reduce on level partitioner comm the size of the max size of the igroup */
501       PetscCallMPI(MPIU_Allreduce(&isize, &isizer, 1, MPI_INT, MPI_MAX, pcomm));
502 
503       /* sum previously computed partition weights on the level comm */
504       wgt  = lwgts[rank];
505       wgtr = wgt;
506       PetscCallMPI(MPIU_Allreduce(&wgt, &wgtr, 1, MPIU_INT, MPI_SUM, pcomm));
507 
508       /* partition weights are given with integers; to properly compute these and be able to propagate them to the next level,
509          we need to compute the least common multiple of isizer across the global comm */
510       a = isizer ? isizer : 1;
511       b = a;
512       PetscCallMPI(MPIU_Allreduce(&a, &b, 1, MPIU_INT, MPIU_LCM, comm));
513 
514       /* finally share this process weight with all the other processes */
515       wgt = isizer ? (b * wgtr) / isizer : 0;
516       PetscCallMPI(MPI_Allgather(&wgt, 1, MPIU_INT, lwgts, 1, MPIU_INT, comm));
517     }
518     PetscCall(PetscFree(lwgts));
519     PetscCall(PetscFree(tranks));
520     PetscCall(PetscFree(lparts));
521     PetscCallMPI(MPI_Op_free(&MPIU_LCM));
522   }
523   PetscCallMPI(MPI_Group_free(&wgroup));
524   PetscCall(PetscLogEventEnd(PetscPartitioner_MS_SetUp, part, 0, 0, 0));
525   PetscFunctionReturn(PETSC_SUCCESS);
526 }
527 
PetscPartitionerMultistageGetStages_Multistage(PetscPartitioner part,PetscInt * levels,MPI_Group * lgroup[])528 PetscErrorCode PetscPartitionerMultistageGetStages_Multistage(PetscPartitioner part, PetscInt *levels, MPI_Group *lgroup[])
529 {
530   PetscPartitioner_MS *p = (PetscPartitioner_MS *)part->data;
531 
532   PetscFunctionBegin;
533   PetscCheckTypeName(part, PETSCPARTITIONERMULTISTAGE);
534   PetscValidHeaderSpecific(part, PETSCPARTITIONER_CLASSID, 1);
535   if (levels) *levels = p->levels;
536   if (lgroup) *lgroup = p->lgroup;
537   PetscFunctionReturn(PETSC_SUCCESS);
538 }
539 
PetscPartitionerMultistageSetStage_Multistage(PetscPartitioner part,PetscInt stage,PetscObject odm)540 PetscErrorCode PetscPartitionerMultistageSetStage_Multistage(PetscPartitioner part, PetscInt stage, PetscObject odm)
541 {
542   DM                   dm = (DM)odm;
543   PetscPartitioner_MS *p  = (PetscPartitioner_MS *)part->data;
544 
545   PetscFunctionBegin;
546   PetscCheckTypeName(part, PETSCPARTITIONERMULTISTAGE);
547   PetscCheck(p->levels, PetscObjectComm((PetscObject)part), PETSC_ERR_ORDER, "Number of stages not set yet");
548   PetscCheck(stage >= 0 && stage < p->levels, PetscObjectComm((PetscObject)part), PETSC_ERR_ARG_OUTOFRANGE, "Invalid stage index %" PetscInt_FMT ", not in [0,%" PetscInt_FMT ")", stage, p->levels);
549 
550   p->stage = stage;
551   PetscCall(PetscObjectReference((PetscObject)dm));
552   PetscCall(DMDestroy(&p->stagedm));
553   p->stagedm = dm;
554   PetscFunctionReturn(PETSC_SUCCESS);
555 }
556 
PetscPartitionerMultistageGetStage_Multistage(PetscPartitioner part,PetscInt * stage,PetscObject * odm)557 PetscErrorCode PetscPartitionerMultistageGetStage_Multistage(PetscPartitioner part, PetscInt *stage, PetscObject *odm)
558 {
559   PetscPartitioner_MS *p = (PetscPartitioner_MS *)part->data;
560 
561   PetscFunctionBegin;
562   PetscCheckTypeName(part, PETSCPARTITIONERMULTISTAGE);
563   if (stage) *stage = p->stage;
564   if (odm) *odm = (PetscObject)p->stagedm;
565   PetscFunctionReturn(PETSC_SUCCESS);
566 }
567 
PetscPartitionerSetUp_Multistage(PetscPartitioner part)568 static PetscErrorCode PetscPartitionerSetUp_Multistage(PetscPartitioner part)
569 {
570   PetscPartitioner_MS *p    = (PetscPartitioner_MS *)part->data;
571   MPI_Comm             comm = PetscObjectComm((PetscObject)part);
572   PetscInt             nstages;
573   MPI_Group           *groups;
574 
575   PetscFunctionBegin;
576   if (p->levels) PetscFunctionReturn(PETSC_SUCCESS);
577   PetscCall(PetscPartitionerMultistage_CreateStages(comm, NULL, &nstages, &groups));
578   PetscCall(PetscPartitionerMultistageSetStages(part, nstages, groups));
579   PetscCall(PetscPartitionerMultistage_DestroyStages(nstages, &groups));
580   PetscFunctionReturn(PETSC_SUCCESS);
581 }
582 
583 /* targetSection argument unused, target partition weights are computed internally */
PetscPartitionerPartition_Multistage(PetscPartitioner part,PetscInt nparts,PetscInt numVertices,PetscInt start[],PetscInt adjacency[],PetscSection vertSection,PetscSection edgeSection,PETSC_UNUSED PetscSection targetSection,PetscSection partSection,IS * partition)584 static PetscErrorCode PetscPartitionerPartition_Multistage(PetscPartitioner part, PetscInt nparts, PetscInt numVertices, PetscInt start[], PetscInt adjacency[], PetscSection vertSection, PetscSection edgeSection, PETSC_UNUSED PetscSection targetSection, PetscSection partSection, IS *partition)
585 {
586   PetscPartitioner_MS *p = (PetscPartitioner_MS *)part->data;
587   PetscPartitioner     ppart;
588   PetscSection         ppartSection = NULL;
589   IS                   lpartition;
590   const PetscInt      *idxs;
591   MPI_Comm             comm, pcomm;
592   MPI_Group            group, lgroup, pgroup;
593   PetscInt            *pstart, *padjacency;
594   PetscInt             nid, i, pedgeCut;
595   PetscMPIInt          sameComm, pparts;
596   PetscBool            freeadj = PETSC_FALSE;
597 
598   PetscFunctionBegin;
599   PetscCheck(p->levels, PetscObjectComm((PetscObject)part), PETSC_ERR_ORDER, "Number of stages not set yet");
600   PetscCheck(p->stage >= 0 && p->stage < p->levels, PetscObjectComm((PetscObject)part), PETSC_ERR_ARG_OUTOFRANGE, "Invalid stage index %" PetscInt_FMT ", not in [0,%" PetscInt_FMT ")", p->stage, p->levels);
601   PetscCall(PetscLogEventBegin(PetscPartitioner_MS_Stage[PetscMin(p->stage, PETSCPARTITIONER_MS_MAXSTAGE)], part, 0, 0, 0));
602 
603   /* Group for current stage (size of the group defines number of "local" partitions) */
604   lgroup = p->lgroup[p->stage];
605   PetscCallMPI(MPI_Group_size(lgroup, &pparts));
606 
607   /* Current stage partitioner */
608   ppart = p->ppart[p->stage];
609   if (ppart) {
610     PetscCall(PetscObjectGetComm((PetscObject)ppart, &pcomm));
611     PetscCallMPI(MPI_Comm_group(pcomm, &pgroup));
612   } else {
613     pcomm  = PETSC_COMM_SELF;
614     pgroup = MPI_GROUP_EMPTY;
615     pparts = -1;
616   }
617 
618   /* Global comm of partitioner */
619   PetscCall(PetscObjectGetComm((PetscObject)part, &comm));
620   PetscCallMPI(MPI_Comm_group(comm, &group));
621 
622   /* Get adjacency */
623   PetscCallMPI(MPI_Group_compare(group, pgroup, &sameComm));
624   if (sameComm != MPI_UNEQUAL) {
625     pstart     = start;
626     padjacency = adjacency;
627   } else { /* restrict to partitioner comm */
628     ISLocalToGlobalMapping l2g, g2l;
629     IS                     gid, rid;
630     const PetscInt        *idxs1;
631     PetscInt              *idxs2;
632     PetscInt               cStart, cEnd, cum;
633     DM                     dm = p->stagedm;
634     PetscSF                sf;
635 
636     PetscCheck(dm, PetscObjectComm((PetscObject)part), PETSC_ERR_PLIB, "Missing stage DM");
637     PetscCall(DMGetPointSF(dm, &sf));
638     PetscCall(DMPlexGetHeightStratum(dm, part->height, &cStart, &cEnd));
639     PetscCall(DMPlexCreateNumbering_Plex(dm, cStart, cEnd, part->height, NULL, sf, &gid));
640     /* filter overlapped local cells (if any) */
641     PetscCall(ISGetIndices(gid, &idxs1));
642     PetscCall(ISGetLocalSize(gid, &cum));
643     PetscCall(PetscMalloc1(cum, &idxs2));
644     for (i = cStart, cum = 0; i < cEnd; i++) {
645       if (idxs1[i - cStart] < 0) continue;
646       idxs2[cum++] = idxs1[i - cStart];
647     }
648     PetscCall(ISRestoreIndices(gid, &idxs1));
649 
650     PetscCheck(numVertices == cum, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected %" PetscInt_FMT " != %" PetscInt_FMT, numVertices, cum);
651     PetscCall(ISDestroy(&gid));
652 
653     /*
654        g2l from full numbering to local numbering
655        l2g from local numbering to restricted numbering
656     */
657     PetscCall(ISCreateGeneral(pcomm, numVertices, idxs2, PETSC_OWN_POINTER, &gid));
658     PetscCall(ISRenumber(gid, NULL, NULL, &rid));
659     PetscCall(ISLocalToGlobalMappingCreateIS(gid, &g2l));
660     PetscCall(ISLocalToGlobalMappingSetType(g2l, ISLOCALTOGLOBALMAPPINGHASH));
661     PetscCall(ISLocalToGlobalMappingCreateIS(rid, &l2g));
662     PetscCall(ISDestroy(&gid));
663     PetscCall(ISDestroy(&rid));
664 
665     PetscCall(PetscMalloc2(numVertices + 1, &pstart, start[numVertices], &padjacency));
666     pstart[0] = 0;
667     for (i = 0; i < numVertices; i++) {
668       PetscCall(ISGlobalToLocalMappingApply(g2l, IS_GTOLM_DROP, start[i + 1] - start[i], adjacency + start[i], &pstart[i + 1], padjacency + pstart[i]));
669       PetscCall(ISLocalToGlobalMappingApply(l2g, pstart[i + 1], padjacency + pstart[i], padjacency + pstart[i]));
670       pstart[i + 1] += pstart[i];
671     }
672     PetscCall(ISLocalToGlobalMappingDestroy(&l2g));
673     PetscCall(ISLocalToGlobalMappingDestroy(&g2l));
674 
675     freeadj = PETSC_TRUE;
676   }
677 
678   /* Compute partitioning */
679   pedgeCut = 0;
680   PetscCall(PetscSectionCreate(pcomm, &ppartSection));
681   if (ppart) {
682     PetscMPIInt prank;
683 
684     PetscCheck(ppart->ops->partition, PetscObjectComm((PetscObject)ppart), PETSC_ERR_ARG_WRONGSTATE, "PetscPartitioner has no partitioning method on stage %" PetscInt_FMT, p->stage);
685     PetscCall(PetscPartitionerPartition(ppart, pparts, numVertices, pstart, padjacency, vertSection, edgeSection, p->tpwgs[p->stage], ppartSection, &lpartition));
686     PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)ppart), &prank));
687     if (!prank) pedgeCut = ppart->edgeCut; /* only the master rank will reduce */
688   } else {                                 /* not participating */
689     PetscCall(ISCreateStride(PETSC_COMM_SELF, numVertices, 0, 1, &lpartition));
690     pparts = numVertices > 0 ? 1 : 0;
691   }
692   if (freeadj) PetscCall(PetscFree2(pstart, padjacency));
693 
694   /* Init final partition (output) */
695   PetscCall(PetscSectionReset(partSection));
696   PetscCall(PetscSectionSetChart(partSection, 0, nparts));
697 
698   /* We need to map the section back to the global comm numbering */
699   for (i = 0; i < pparts; i++) {
700     PetscInt    dof;
701     PetscMPIInt mp, mpt;
702 
703     if (lgroup != MPI_GROUP_EMPTY) {
704       PetscCall(PetscMPIIntCast(i, &mp));
705       PetscCall(PetscSectionGetDof(ppartSection, i, &dof));
706       PetscCallMPI(MPI_Group_translate_ranks(lgroup, 1, &mp, group, &mpt));
707     } else {
708       PetscCheck(pparts == 1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Unexpected pparts %d", pparts);
709       PetscCallMPI(MPI_Comm_rank(comm, &mpt));
710       PetscCall(ISGetLocalSize(lpartition, &dof));
711     }
712     PetscCall(PetscSectionSetDof(partSection, mpt, dof));
713   }
714   PetscCall(PetscSectionSetUp(partSection));
715   PetscCall(PetscSectionDestroy(&ppartSection));
716 
717   /* No need to translate the "partition" output, as it is in local cell numbering
718      we only change the comm of the index set */
719   PetscCall(ISGetIndices(lpartition, &idxs));
720   PetscCall(ISGetLocalSize(lpartition, &nid));
721   PetscCall(ISCreateGeneral(comm, nid, idxs, PETSC_COPY_VALUES, partition));
722   PetscCall(ISRestoreIndices(lpartition, &idxs));
723   PetscCall(ISDestroy(&lpartition));
724 
725   PetscCall(PetscSectionViewFromOptions(partSection, (PetscObject)part, "-petscpartitioner_multistage_partition_view"));
726   PetscCall(ISViewFromOptions(*partition, (PetscObject)part, "-petscpartitioner_multistage_partition_view"));
727 
728   /* Update edge-cut */
729   p->edgeCut[p->stage] = pedgeCut;
730   for (i = p->stage + 1; i < p->levels; i++) p->edgeCut[i] = 0;
731   for (i = 0; i < p->stage; i++) pedgeCut += p->edgeCut[i];
732   part->edgeCut = -1;
733   PetscCallMPI(MPI_Reduce(&pedgeCut, &part->edgeCut, 1, MPIU_INT, MPI_SUM, 0, comm));
734 
735   PetscCallMPI(MPI_Group_free(&group));
736   if (pgroup != MPI_GROUP_EMPTY) PetscCallMPI(MPI_Group_free(&pgroup));
737   PetscCall(PetscLogEventEnd(PetscPartitioner_MS_Stage[PetscMin(p->stage, PETSCPARTITIONER_MS_MAXSTAGE)], part, 0, 0, 0));
738   PetscFunctionReturn(PETSC_SUCCESS);
739 }
740 
PetscPartitionerSetFromOptions_Multistage(PetscPartitioner part,PetscOptionItems PetscOptionsObject)741 static PetscErrorCode PetscPartitionerSetFromOptions_Multistage(PetscPartitioner part, PetscOptionItems PetscOptionsObject)
742 {
743   PetscPartitioner_MS *p        = (PetscPartitioner_MS *)part->data;
744   PetscEnum            strategy = (PetscEnum)PETSCPARTITIONER_MS_STRATEGY_NODE;
745   PetscBool            set, roundrobin;
746   PetscInt             options[3] = {PETSC_DETERMINE, PETSC_DETERMINE, PETSC_DETERMINE};
747 
748   PetscFunctionBegin;
749   PetscOptionsHeadBegin(PetscOptionsObject, "PetscPartitioner Multistate Options");
750   PetscCall(PetscOptionsEnum("-petscpartitioner_multistage_strategy", "Default partitioning strategy", "", PetscPartitionerMultistageStrategyList, strategy, &strategy, &set));
751   if (set || !p->levels) {
752     options[0] = (PetscInt)strategy;
753     switch (options[0]) {
754     case PETSCPARTITIONER_MS_STRATEGY_NODE:
755       options[1] = PETSC_DETERMINE;
756       roundrobin = PETSC_FALSE;
757       PetscCall(PetscOptionsInt("-petscpartitioner_multistage_node_size", "Number of processes per node", "", options[1], &options[1], NULL));
758       PetscCall(PetscOptionsBool("-petscpartitioner_multistage_node_interleaved", "Use round robin rank assignments", "", roundrobin, &roundrobin, NULL));
759       options[2] = (PetscInt)roundrobin;
760       break;
761     case PETSCPARTITIONER_MS_STRATEGY_MSECTION:
762       options[1] = 2;
763       PetscCall(PetscOptionsInt("-petscpartitioner_multistage_msection", "Number of splits per level", "", options[1], &options[1], NULL));
764       break;
765     default:
766       break; /* LCOV_EXCL_LINE */
767     }
768   }
769   PetscCall(PetscOptionsBool("-petscpartitioner_multistage_tpwgts", "Use target partition weights in stage partitioners", "", p->compute_tpwgs, &p->compute_tpwgs, NULL));
770   PetscCall(PetscOptionsBool("-petscpartitioner_multistage_viewtpwgts", "View target partition weights", "", p->view_tpwgs, &p->view_tpwgs, NULL));
771   PetscOptionsHeadEnd();
772 
773   if (options[0] != PETSC_DETERMINE) {
774     MPI_Comm   comm = PetscObjectComm((PetscObject)part);
775     PetscInt   nstages;
776     MPI_Group *groups;
777 
778     PetscCall(PetscPartitionerMultistage_CreateStages(comm, options, &nstages, &groups));
779     PetscCall(PetscPartitionerMultistageSetStages(part, nstages, groups));
780     PetscCall(PetscPartitionerMultistage_DestroyStages(nstages, &groups));
781   }
782 
783   {
784     PetscInt nstages = p->levels, l;
785     for (l = 0; l < nstages; l++) {
786       if (p->ppart[l]) PetscCall(PetscPartitionerSetFromOptions(p->ppart[l]));
787     }
788   }
789   PetscFunctionReturn(PETSC_SUCCESS);
790 }
791 
792 /*MC
793   PETSCPARTITIONERMULTISTAGE = "multistage" - A PetscPartitioner object using a multistage distribution strategy
794 
795   Level: intermediate
796 
797   Options Database Keys:
798 +  -petscpartitioner_multistage_strategy <strategy> - Either `PETSCPARTITIONER_MS_STRATEGY_NODE`, or `PETSCPARTITIONER_MS_STRATEGY_MSECTION`
799 .  -petscpartitioner_multistage_node_size <int> - Number of processes per computing node (or `PETSC_DECIDE`)
800 .  -petscpartitioner_multistage_node_interleaved <bool> - Assign ranks round-robin.
801 .  -petscpartitioner_multistage_msection <int> - Number of splits per level in recursive m-section splits (use `2` for bisection)
.  -petscpartitioner_multistage_tpwgts <bool> - Use target partition weights in stage partitioners
-  -petscpartitioner_multistage_viewtpwgts <bool> - View the target partition weights used in stage partitioners
803 
804   Notes:
  The default multistage strategy uses `PETSCPARTITIONER_MS_STRATEGY_NODE` and automatically discovers node information using `MPI_Comm_split_type`.
806   `PETSCPARTITIONER_MS_STRATEGY_MSECTION` is more for testing purposes.
807   Options for single stage partitioners are prefixed by `-petscpartitioner_multistage_levels_`.
808   For example, to use parmetis in all stages, `-petscpartitioner_multistage_levels_petscpartitioner_type parmetis`
809   Finer grained control is also possible: `-petscpartitioner_multistage_levels_0_petscpartitioner_type parmetis`, `-petscpartitioner_multistage_levels_1_petscpartitioner_type simple`
810 
811 .seealso: `PetscPartitionerType`, `PetscPartitionerCreate()`, `PetscPartitionerSetType()`
812 M*/
813 
PetscPartitionerCreate_Multistage(PetscPartitioner part)814 PETSC_EXTERN PetscErrorCode PetscPartitionerCreate_Multistage(PetscPartitioner part)
815 {
816   PetscPartitioner_MS *p;
817 
818   PetscFunctionBegin;
819   PetscValidHeaderSpecific(part, PETSCPARTITIONER_CLASSID, 1);
820   PetscCall(PetscNew(&p));
821   p->compute_tpwgs = PETSC_TRUE;
822 
823   part->data                = p;
824   part->ops->view           = PetscPartitionerView_Multistage;
825   part->ops->destroy        = PetscPartitionerDestroy_Multistage;
826   part->ops->partition      = PetscPartitionerPartition_Multistage;
827   part->ops->setfromoptions = PetscPartitionerSetFromOptions_Multistage;
828   part->ops->setup          = PetscPartitionerSetUp_Multistage;
829 
830   PetscCall(PetscCitationsRegister(MSPartitionerCitation, &MSPartitionerCite));
831   PetscFunctionReturn(PETSC_SUCCESS);
832 }
833