#include <petscsf.h>
#include <petscdmplex.h>
#include <petsc/private/dmimpl.h>
#include <petsc/private/dmpleximpl.h>
#include <petsc/private/partitionerimpl.h> /*I "petscpartitioner.h" I*/

PetscBool MSPartitionerCite = PETSC_FALSE;
const char MSPartitionerCitation[] = "@article{PETScMSPart2021,\n"
                                     " author = {Parsani, Matteo and Boukharfane, Radouan and Nolasco, Irving Reyna and Fern{\'a}ndez, David C Del Rey and Zampini, Stefano and Hadri, Bilel and Dalcin, Lisandro},\n"
                                     " title = {High-order accurate entropy-stable discontinuous collocated Galerkin methods with the summation-by-parts property for compressible {CFD} frameworks: Scalable {SSDC} algorithms and flow solver},\n"
                                     " journal = {Journal of Computational Physics},\n"
                                     " volume = {424},\n"
                                     " pages = {109844},\n"
                                     " year = {2021}\n"
                                     "}\n";

PetscLogEvent PetscPartitioner_MS_SetUp;
PetscLogEvent PetscPartitioner_MS_Stage[PETSCPARTITIONER_MS_NUMSTAGE];

typedef struct {
  PetscInt   levels;
  MPI_Group *lgroup;
  /* Need access to the DM in inner stages */
  PetscInt stage;
  DM       stagedm;
  /* Stage partitioners */
  PetscPartitioner *ppart;
  /* Diagnostic */
  PetscInt *edgeCut;
  /* Target partition weights */
  PetscBool     view_tpwgs;
  PetscSection *tpwgs;
  PetscBool     compute_tpwgs;
} PetscPartitioner_MS;

const char *const PetscPartitionerMultistageStrategyList[] = {"NODE", "MSECTION", "PetscPartitionerMultistageStrategy", "PETSCPARTITIONER_MS_", NULL};

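/* MPI reduction callback computing the elementwise least common multiple;
   e.g., combining {4, 6} with {6, 4} yields {12, 12} */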
static void PetscLCM_Local(void *in, void *out, PetscMPIInt *cnt, MPI_Datatype *datatype)
{
  PetscInt  count = *cnt;
  PetscInt *xin = (PetscInt *)in, *xout = (PetscInt *)out;

  PetscFunctionBegin;
  if (*datatype != MPIU_INT) {
    (void)(*PetscErrorPrintf)("Can only handle MPIU_INT");
    PETSCABORT(MPI_COMM_SELF, PETSC_ERR_ARG_WRONG);
  }
  for (PetscInt i = 0; i < count; i++) xout[i] = PetscLCM(xin[i], xout[i]);
  PetscFunctionReturnVoid();
}

static PetscErrorCode PetscPartitionerView_Multistage(PetscPartitioner part, PetscViewer viewer)
{
  PetscPartitioner_MS *p = (PetscPartitioner_MS *)part->data;
  PetscViewerFormat    format;
  PetscBool            iascii;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
  PetscCall(PetscViewerGetFormat(viewer, &format));
  if (iascii) {
    MPI_Comm  comm;
    MPI_Group group;

    PetscCall(PetscViewerASCIIPushTab(viewer));
    if (!p->stagedm || p->stage == 0) PetscCall(PetscViewerASCIIPrintf(viewer, "Multistage graph partitioner: total stages %" PetscInt_FMT "\n", p->levels));
    comm = PetscObjectComm((PetscObject)part);
    PetscCallMPI(MPI_Comm_group(comm, &group));
    for (PetscInt l = 0; l < p->levels; l++) {
      PetscPartitioner ppart  = p->ppart[l];
      MPI_Comm         pcomm  = PetscObjectComm((PetscObject)ppart);
      MPI_Group        pgroup = MPI_GROUP_EMPTY;

      PetscCall(PetscViewerASCIIPushTab(viewer));
      if (l) {
        if (pcomm != MPI_COMM_NULL) PetscCallMPI(MPI_Comm_group(pcomm, &pgroup));
      } else {
        pgroup = p->lgroup[0];
      }

      if (l) {
        IS          is;
        PetscMPIInt gr, gem = 1;
        PetscInt    uniq;

        PetscCallMPI(MPI_Group_size(group, &gr));
        if (pgroup != MPI_GROUP_EMPTY) {
          gem = 0;
          PetscCallMPI(MPI_Group_translate_ranks(pgroup, 1, &gem, group, &gr));
        }
        PetscCall(ISCreateStride(PetscObjectComm((PetscObject)part), 1, gr, 1, &is));
        PetscCall(ISRenumber(is, NULL, &uniq, NULL));
        PetscCall(ISDestroy(&is));
        PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &gem, 1, MPI_INT, MPI_SUM, PetscObjectComm((PetscObject)part)));
        if (gem) uniq--;
        if (!p->stagedm || l == p->stage) PetscCall(PetscViewerASCIIPrintf(viewer, "Stage %" PetscInt_FMT " partitioners (%" PetscInt_FMT " unique groups, %d empty processes)\n", l, uniq, gem));
      } else {
        PetscMPIInt psize;

        PetscCallMPI(MPI_Group_size(pgroup, &psize));
        if (!p->stagedm || l == p->stage) PetscCall(PetscViewerASCIIPrintf(viewer, "Stage %" PetscInt_FMT " partitioner to %d processes\n", l, psize));
      }
      PetscCall(PetscViewerFlush(viewer));
      if (format == PETSC_VIEWER_ASCII_INFO_DETAIL && (!p->stagedm || l == p->stage)) {
        PetscViewer pviewer;

        if (pcomm == MPI_COMM_NULL) pcomm = PETSC_COMM_SELF;
        PetscCall(PetscViewerGetSubViewer(viewer, pcomm, &pviewer));
        if (ppart) {
          PetscMPIInt size, *ranks, *granks;
          char        tstr[16], strranks[3072]; /* I'm lazy: max 12 chars (including spaces) per rank -> 256 ranks max */

          PetscCallMPI(MPI_Group_size(pgroup, &size));
          PetscCall(PetscMalloc2(size, &ranks, size, &granks));
          for (PetscMPIInt i = 0; i < size; i++) ranks[i] = i;
          PetscCallMPI(MPI_Group_translate_ranks(pgroup, size, ranks, group, granks));
          if (size <= 256) {
            PetscCall(PetscStrncpy(strranks, "", sizeof(strranks)));
            for (PetscInt i = 0; i < size; i++) {
              PetscCall(PetscSNPrintf(tstr, sizeof(tstr), " %d", granks[i]));
              PetscCall(PetscStrlcat(strranks, tstr, sizeof(strranks)));
            }
          } else PetscCall(PetscStrncpy(strranks, " not showing > 256", sizeof(strranks))); /* LCOV_EXCL_LINE */
          PetscCall(PetscFree2(ranks, granks));
          if (!l) {
            PetscCall(PetscViewerASCIIPrintf(pviewer, "Destination ranks:%s\n", strranks));
            PetscCall(PetscPartitionerView(ppart, pviewer));
          } else {
            PetscCall(PetscPartitionerView(ppart, pviewer));
            PetscCall(PetscViewerASCIIPrintf(pviewer, " Participating ranks:%s\n", strranks));
          }
          if (p->view_tpwgs) PetscCall(PetscSectionView(p->tpwgs[l], pviewer));
        }
        PetscCall(PetscViewerRestoreSubViewer(viewer, pcomm, &pviewer));
      }
      PetscCall(PetscViewerFlush(viewer));

      if (l && pgroup != MPI_GROUP_EMPTY) PetscCallMPI(MPI_Group_free(&pgroup));
    }
    for (PetscInt l = 0; l < p->levels; l++) PetscCall(PetscViewerASCIIPopTab(viewer));
    PetscCall(PetscViewerASCIIPopTab(viewer));
    PetscCallMPI(MPI_Group_free(&group));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PetscPartitionerMultistage_CreateStages(MPI_Comm comm, const PetscInt *options, PetscInt *nlevels, MPI_Group *levels[])
{
  MPI_Comm     ncomm;
  MPI_Group   *lgroup;
  MPI_Group    ggroup, ngroup;
  PetscMPIInt *ranks, *granks;
  PetscMPIInt  size, nsize, isize, rank, nrank, i, l, n, m;
  PetscInt     strategy = options ? options[0] : (PetscInt)PETSCPARTITIONER_MS_STRATEGY_NODE;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));

  if (size == 1) {
    PetscCall(PetscMalloc1(1, &lgroup));
    PetscCallMPI(MPI_Comm_group(comm, &lgroup[0]));
    *nlevels = 1;
    *levels  = lgroup;
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  switch (strategy) {
  case PETSCPARTITIONER_MS_STRATEGY_NODE:
    /* create groups (in global rank ordering of comm) for the 2-level partitioner */
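    /* worked example (illustrative): with 4 ranks split across 2 nodes as {0,1} and {2,3},
       the code below builds lgroup[0] = {0,2} (the node master ranks) and, on each process,
       lgroup[1] = the group of the ranks of its own node */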
    if (options && options[1] > 0) {
      PetscMPIInt node_size = (PetscMPIInt)options[1];

      if (node_size > 1) {
        PetscMPIInt color;

        if (options[2]) { /* interleaved */
          PetscMPIInt ngroups = size / node_size;

          if (size % node_size) ngroups += 1;
          color = rank % ngroups;
        } else {
          color = rank / node_size;
        }
        PetscCallMPI(MPI_Comm_split(comm, color, rank, &ncomm));
      } else {
        PetscCallMPI(MPI_Comm_dup(comm, &ncomm));
      }
    } else {
#if defined(PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY)
      PetscCallMPI(MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &ncomm));
#else
      /* if the user does not specify the node size and MPI_Comm_split_type is not available, default to the full comm */
      PetscCallMPI(MPI_Comm_dup(comm, &ncomm));
#endif
    }

    PetscCallMPI(MPI_Comm_size(ncomm, &nsize));
    if (nsize == size) { /* one node */
      PetscCall(PetscMalloc1(1, &lgroup));
      PetscCallMPI(MPI_Comm_group(ncomm, &lgroup[0]));
      PetscCallMPI(MPI_Comm_free(&ncomm));

      *nlevels = 1;
      *levels  = lgroup;
      break;
    }

    PetscCall(PetscMalloc1(2, &lgroup));

    /* intranode group (in terms of global rank) */
    PetscCall(PetscMalloc2(size, &ranks, nsize, &granks));
    for (i = 0; i < nsize; i++) ranks[i] = i;
    PetscCallMPI(MPI_Comm_group(comm, &ggroup));
    PetscCallMPI(MPI_Comm_group(ncomm, &ngroup));
    PetscCallMPI(MPI_Group_translate_ranks(ngroup, nsize, ranks, ggroup, granks));
    PetscCallMPI(MPI_Group_incl(ggroup, nsize, granks, &lgroup[1]));

    /* internode group (master processes on the nodes only);
       this group must be specified by all processes in the comm,
       so we need to gather those master ranks */
    PetscCallMPI(MPI_Group_rank(ngroup, &nrank));
    granks[0] = !nrank ? rank : -1;
    PetscCallMPI(MPI_Allgather(granks, 1, MPI_INT, ranks, 1, MPI_INT, comm));
    for (i = 0, isize = 0; i < size; i++)
      if (ranks[i] >= 0) ranks[isize++] = ranks[i];
    PetscCallMPI(MPI_Group_incl(ggroup, isize, ranks, &lgroup[0]));

    PetscCall(PetscFree2(ranks, granks));
    PetscCallMPI(MPI_Group_free(&ggroup));
    PetscCallMPI(MPI_Group_free(&ngroup));
    PetscCallMPI(MPI_Comm_free(&ncomm));

    *nlevels = 2;
    *levels  = lgroup;
    break;

  case PETSCPARTITIONER_MS_STRATEGY_MSECTION:
    /* recursive m-section (m = 2 gives bisection) */
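    /* worked example (illustrative): with m = 2 and 8 ranks, this yields l = 3 stages:
       stage 0 partitions among ranks {0,4}; stage 1 among {0,2} and {4,6};
       stage 2 among {0,1}, {2,3}, {4,5}, and {6,7} */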
    m = options ? (PetscMPIInt)options[1] : 2;
    PetscCheck(m >= 2, comm, PETSC_ERR_SUP, "Invalid split %d, must be greater than one", m);
    l = 0;
    n = 1;
    while (n <= size) {
      l++;
      n *= m;
    }
    l -= 1;
    n /= m;

    PetscCheck(l != 0, comm, PETSC_ERR_SUP, "Invalid split %d with communicator size %d", m, size);
    PetscCall(PetscMalloc1(l, &lgroup));
    for (i = 1; i < l; i++) lgroup[i] = MPI_GROUP_EMPTY;

    if (l > 1) {
      PetscMPIInt rem = size - n;

      PetscCall(PetscMalloc1((m + rem), &ranks));
      for (i = 0; i < m; i++) ranks[i] = i * (n / m);
      for (i = 0; i < rem; i++) ranks[i + m] = n + i;
      PetscCallMPI(MPI_Comm_group(comm, &ggroup));
      PetscCallMPI(MPI_Group_incl(ggroup, m + rem, ranks, &lgroup[0]));
      if (rank < n) {
        for (i = 1; i < l; i++) {
          PetscMPIInt inc = (PetscMPIInt)PetscPowInt(m, l - i - 1);

          if (rank % inc == 0) {
            PetscMPIInt anch = (rank - rank % (inc * m)), r;

            for (r = 0; r < m; r++) ranks[r] = anch + r * inc;
            PetscCallMPI(MPI_Group_incl(ggroup, m, ranks, &lgroup[i]));
          }
        }
      }
      PetscCall(PetscFree(ranks));
      PetscCallMPI(MPI_Group_free(&ggroup));
    } else {
      PetscCallMPI(MPI_Comm_group(comm, &lgroup[0]));
    }

    *nlevels = l;
    *levels  = lgroup;
    break;

  default:
    SETERRQ(comm, PETSC_ERR_SUP, "Not implemented"); /* LCOV_EXCL_LINE */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PetscPartitionerMultistage_DestroyStages(PetscInt nstages, MPI_Group *groups[])
{
  PetscFunctionBegin;
  for (PetscInt l = 0; l < nstages; l++) {
    MPI_Group group = (*groups)[l];

    if (group != MPI_GROUP_EMPTY) PetscCallMPI(MPI_Group_free(&group));
  }
  PetscCall(PetscFree(*groups));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PetscPartitionerReset_Multistage(PetscPartitioner part)
{
  PetscPartitioner_MS *p       = (PetscPartitioner_MS *)part->data;
  PetscInt             nstages = p->levels;

  PetscFunctionBegin;
  p->levels = 0;
  PetscCall(PetscPartitionerMultistage_DestroyStages(nstages, &p->lgroup));
  for (PetscInt l = 0; l < nstages; l++) {
    PetscCall(PetscPartitionerDestroy(&p->ppart[l]));
    PetscCall(PetscSectionDestroy(&p->tpwgs[l]));
  }
  PetscCall(PetscFree(p->ppart));
  PetscCall(PetscFree(p->tpwgs));
  PetscCall(PetscFree(p->edgeCut));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PetscPartitionerDestroy_Multistage(PetscPartitioner part)
{
  PetscFunctionBegin;
  PetscCall(PetscPartitionerReset_Multistage(part));
  PetscCall(PetscFree(part->data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  PetscPartitionerMultistageSetStages - Sets the stages for the multistage partitioning

  Collective

  Input Parameters:
+ part   - the `PetscPartitioner` object
. levels - the number of stages
- lgroup - array of `MPI_Group`s, one for each stage

  Level: advanced

  Notes:
  `MPI_Comm_create(comm, lgroup[l], &lcomm)` is used to compute the communicator for the stage partitioner at level `l`.
  The groups must be specified in the process numbering of the partitioner communicator.
  `lgroup[0]` must be collectively specified, and it must represent a proper subset of the communicator associated with the original partitioner.
  At each level, a rank can be listed in at most one group (but it can be listed at different levels); see the sketch below.
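
  Example Usage:
  A minimal sketch, with hypothetical groups `g0` and `g1` built by the caller according to the rules above (e.g., `g0` holding one master rank per node and `g1` holding the ranks of the local node).
.vb
  PetscPartitioner part;
  MPI_Group        groups[2] = {g0, g1};

  PetscCall(PetscPartitionerCreate(comm, &part));
  PetscCall(PetscPartitionerSetType(part, PETSCPARTITIONERMULTISTAGE));
  PetscCall(PetscPartitionerMultistageSetStages(part, 2, groups));
.ve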

.seealso: `PetscPartitionerSetType()`, `PetscPartitionerDestroy()`, `PETSCPARTITIONERMULTISTAGE`
@*/
PetscErrorCode PetscPartitionerMultistageSetStages(PetscPartitioner part, PetscInt levels, MPI_Group lgroup[])
{
  PetscPartitioner_MS *p = (PetscPartitioner_MS *)part->data;
  MPI_Comm             comm;
  MPI_Group            wgroup;
  PetscMPIInt        **lparts = NULL;
  PetscMPIInt          rank, size;
  PetscBool            touched, isms;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(part, PETSCPARTITIONER_CLASSID, 1);
  PetscValidLogicalCollectiveInt(part, levels, 2);
  PetscCheck(levels >= 0, PetscObjectComm((PetscObject)part), PETSC_ERR_ARG_OUTOFRANGE, "Number of levels must be non-negative");
  if (levels) PetscAssertPointer(lgroup, 3);
  PetscCall(PetscObjectTypeCompare((PetscObject)part, PETSCPARTITIONERMULTISTAGE, &isms));
  if (!isms) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(PetscPartitioner_MS_SetUp, part, 0, 0, 0));

  PetscCall(PetscPartitionerReset_Multistage(part));

  PetscCall(PetscObjectGetComm((PetscObject)part, &comm));
  PetscCallMPI(MPI_Comm_group(comm, &wgroup));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));

  p->levels = levels;
  PetscCall(PetscCalloc1(p->levels, &p->ppart));
  PetscCall(PetscCalloc1(p->levels, &p->lgroup));
  PetscCall(PetscCalloc1(p->levels, &p->tpwgs));
  PetscCall(PetscCalloc1(p->levels, &p->edgeCut));

  /* support for target partition weights */
  touched = PETSC_FALSE;
  if (p->compute_tpwgs) PetscCall(PetscMalloc1(p->levels, &lparts));

  for (PetscInt l = 0; l < p->levels; l++) {
    const char *prefix;
    char        aprefix[256];
    MPI_Comm    lcomm;

    if (l) { /* let MPI complain/hang if the user did not specify the groups properly */
      PetscCallMPI(MPI_Comm_create(comm, lgroup[l], &lcomm));
    } else { /* in debug mode, check that the initial group is consistently (collectively) specified on comm */
#if defined(PETSC_USE_DEBUG)
      MPI_Group    group, igroup = lgroup[0];
      PetscMPIInt *ranks, *granks;
      PetscMPIInt  b[2], b2[2], csize, gsize;

      PetscCallMPI(MPI_Group_size(igroup, &gsize));
      b[0] = -gsize;
      b[1] = +gsize;
      PetscCallMPI(MPIU_Allreduce(b, b2, 2, MPI_INT, MPI_MAX, comm));
      PetscCheck(-b2[0] == b2[1], comm, PETSC_ERR_ARG_WRONG, "Initial group must be collectively specified");
      PetscCallMPI(MPI_Comm_group(comm, &group));
      PetscCallMPI(MPI_Group_size(group, &csize));
      PetscCall(PetscMalloc2(gsize, &ranks, (csize * gsize), &granks));
      for (PetscMPIInt i = 0; i < gsize; i++) granks[i] = i;
      PetscCallMPI(MPI_Group_translate_ranks(igroup, gsize, granks, group, ranks));
      PetscCallMPI(MPI_Group_free(&group));
      PetscCallMPI(MPI_Allgather(ranks, gsize, MPI_INT, granks, gsize, MPI_INT, comm));
      for (PetscMPIInt i = 0; i < gsize; i++) {
        PetscMPIInt chkr = granks[i];

        for (PetscMPIInt j = 0; j < csize; j++) {
          PetscMPIInt shft = j * gsize + i;

          PetscCheck(chkr == granks[shft], comm, PETSC_ERR_ARG_WRONG, "Initial group must be collectively specified");
        }
      }
      PetscCall(PetscFree2(ranks, granks));
#endif
      lcomm = comm;
    }
    PetscCall(PetscObjectGetOptionsPrefix((PetscObject)part, &prefix));
    if (lcomm != MPI_COMM_NULL) {
      /* MPI has no MPI_Group_dup(), so duplicate via a union with the empty group */
      PetscCallMPI(MPI_Group_union(lgroup[l], MPI_GROUP_EMPTY, &p->lgroup[l]));
      PetscCall(PetscPartitionerCreate(lcomm, &p->ppart[l]));
      PetscCall(PetscObjectSetOptionsPrefix((PetscObject)p->ppart[l], prefix));
      PetscCall(PetscSNPrintf(aprefix, sizeof(aprefix), "petscpartitioner_multistage_levels_%" PetscInt_FMT "_", l));
      PetscCall(PetscObjectAppendOptionsPrefix((PetscObject)p->ppart[l], aprefix));
    } else {
      PetscCheck(l != 0, PetscObjectComm((PetscObject)part), PETSC_ERR_USER, "Invalid first group");
      p->lgroup[l] = MPI_GROUP_EMPTY;
    }
    if (lcomm != comm && lcomm != MPI_COMM_NULL) PetscCallMPI(MPI_Comm_free(&lcomm));

    /* compute the number of partitions per level and detect whether this process takes part in the partitioning (at any level) */
    if (p->compute_tpwgs) {
      PetscMPIInt gsize;

      PetscCall(PetscMalloc1(size, &lparts[l]));
      PetscCallMPI(MPI_Group_size(p->lgroup[l], &gsize));
      if (!l) {
        PetscMPIInt tr;

        PetscCallMPI(MPI_Group_translate_ranks(wgroup, 1, &rank, p->lgroup[0], &tr));
        if (tr == MPI_UNDEFINED) gsize = 0;
      }
      if (touched && !gsize) gsize = 1;
      PetscCallMPI(MPI_Allgather(&gsize, 1, MPI_INT, lparts[l], 1, MPI_INT, comm));
      if (lparts[l][rank]) touched = PETSC_TRUE;
    }
  }

  /* determine weights (bottom-up) */
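  /* worked example (illustrative): with 4 ranks on 2 nodes ({0,1} and {2,3}) and the NODE
     strategy, every rank ends up touched, so lwgts starts as all ones; the bottom-up pass
     below then produces stage-1 (intranode) target weights (1,1) on each node and stage-0
     (internode) target weights (2,2), i.e. each node master targets two final parts */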
  if (p->compute_tpwgs) {
    PetscMPIInt *tranks;
    PetscInt    *lwgts, wgt;
    MPI_Op       MPIU_LCM; /* XXX this is actually recreated at every setup */

    PetscCall(PetscMalloc1((2 * size), &tranks));

    /* we need to compute the least common multiple across processes */
    PetscCallMPI(MPI_Op_create(PetscLCM_Local, 1, &MPIU_LCM));

    /* final target has to have all ones as weights (if the process gets touched) */
    wgt = touched ? 1 : 0;
    PetscCall(PetscMalloc1(size, &lwgts));
    PetscCallMPI(MPI_Allgather(&wgt, 1, MPIU_INT, lwgts, 1, MPIU_INT, comm));

    /* now go up the hierarchy and populate the PetscSection describing the partition weights */
    for (PetscInt l = p->levels - 1; l >= 0; l--) {
      MPI_Comm    pcomm;
      MPI_Group   igroup;
      PetscMPIInt isize, isizer = 0;
      PetscInt    a, b, wgtr;
      PetscMPIInt gsize;
      PetscBool   usep = PETSC_FALSE;

      if (p->ppart[l]) {
        PetscCall(PetscObjectGetComm((PetscObject)p->ppart[l], &pcomm));
      } else {
        pcomm = PETSC_COMM_SELF;
      }

      PetscCallMPI(MPI_Group_size(p->lgroup[l], &gsize));
      if (gsize) {
        usep = PETSC_TRUE;
        for (PetscMPIInt i = 0; i < gsize; i++) tranks[i] = i;
        PetscCallMPI(MPI_Group_translate_ranks(p->lgroup[l], gsize, tranks, wgroup, tranks + size));
      } else gsize = lparts[l][rank];
      PetscCall(PetscFree(lparts[l]));
      PetscCall(PetscSectionCreate(pcomm, &p->tpwgs[l]));
      PetscCall(PetscObjectSetName((PetscObject)p->tpwgs[l], "Target partition weights"));
      PetscCall(PetscSectionSetChart(p->tpwgs[l], 0, gsize));

      if (usep) {
        PetscMPIInt *tr = tranks + size;

        for (PetscMPIInt i = 0; i < gsize; i++) PetscCall(PetscSectionSetDof(p->tpwgs[l], i, lwgts[tr[i]]));
      } else if (gsize) {
        PetscCall(PetscSectionSetDof(p->tpwgs[l], 0, 1));
      }
      PetscCall(PetscSectionSetUp(p->tpwgs[l]));
      if (!l) break;

      /* determine number of processes shared by two consecutive levels */
      PetscCallMPI(MPI_Group_intersection(p->lgroup[l], p->lgroup[l - 1], &igroup));
      PetscCallMPI(MPI_Group_size(igroup, &isize));
      PetscCallMPI(MPI_Group_free(&igroup));
      if (!p->ppart[l] && touched) isize = 1; /* if this process gets touched, it needs to propagate its data from one level to the other */

      /* reduce the maximum size of igroup over the level partitioner comm */
      PetscCallMPI(MPIU_Allreduce(&isize, &isizer, 1, MPI_INT, MPI_MAX, pcomm));

      /* sum previously computed partition weights on the level comm */
      wgt  = lwgts[rank];
      wgtr = wgt;
      PetscCallMPI(MPIU_Allreduce(&wgt, &wgtr, 1, MPIU_INT, MPI_SUM, pcomm));

      /* partition weights are given with integers; to properly compute these and be able to propagate them to the next level,
         we need to compute the least common multiple of isizer across the global comm */
      a = isizer ? isizer : 1;
      b = a;
      PetscCallMPI(MPIU_Allreduce(&a, &b, 1, MPIU_INT, MPIU_LCM, comm));

      /* finally share this process weight with all the other processes */
      wgt = isizer ? (b * wgtr) / isizer : 0;
      PetscCallMPI(MPI_Allgather(&wgt, 1, MPIU_INT, lwgts, 1, MPIU_INT, comm));
    }
    PetscCall(PetscFree(lwgts));
    PetscCall(PetscFree(tranks));
    PetscCall(PetscFree(lparts));
    PetscCallMPI(MPI_Op_free(&MPIU_LCM));
  }
  PetscCallMPI(MPI_Group_free(&wgroup));
  PetscCall(PetscLogEventEnd(PetscPartitioner_MS_SetUp, part, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode PetscPartitionerMultistageGetStages_Multistage(PetscPartitioner part, PetscInt *levels, MPI_Group *lgroup[])
{
  PetscPartitioner_MS *p = (PetscPartitioner_MS *)part->data;

  PetscFunctionBegin;
  PetscCheckTypeName(part, PETSCPARTITIONERMULTISTAGE);
  PetscValidHeaderSpecific(part, PETSCPARTITIONER_CLASSID, 1);
  if (levels) *levels = p->levels;
  if (lgroup) *lgroup = p->lgroup;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode PetscPartitionerMultistageSetStage_Multistage(PetscPartitioner part, PetscInt stage, PetscObject odm)
{
  DM                   dm = (DM)odm;
  PetscPartitioner_MS *p  = (PetscPartitioner_MS *)part->data;

  PetscFunctionBegin;
  PetscCheckTypeName(part, PETSCPARTITIONERMULTISTAGE);
  PetscCheck(p->levels, PetscObjectComm((PetscObject)part), PETSC_ERR_ORDER, "Number of stages not set yet");
  PetscCheck(stage >= 0 && stage < p->levels, PetscObjectComm((PetscObject)part), PETSC_ERR_ARG_OUTOFRANGE, "Invalid stage index %" PetscInt_FMT ", not in [0,%" PetscInt_FMT ")", stage, p->levels);

  p->stage = stage;
  PetscCall(PetscObjectReference((PetscObject)dm));
  PetscCall(DMDestroy(&p->stagedm));
  p->stagedm = dm;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode PetscPartitionerMultistageGetStage_Multistage(PetscPartitioner part, PetscInt *stage, PetscObject *odm)
{
  PetscPartitioner_MS *p = (PetscPartitioner_MS *)part->data;

  PetscFunctionBegin;
  PetscCheckTypeName(part, PETSCPARTITIONERMULTISTAGE);
  if (stage) *stage = p->stage;
  if (odm) *odm = (PetscObject)p->stagedm;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PetscPartitionerSetUp_Multistage(PetscPartitioner part)
{
  PetscPartitioner_MS *p    = (PetscPartitioner_MS *)part->data;
  MPI_Comm             comm = PetscObjectComm((PetscObject)part);
  PetscInt             nstages;
  MPI_Group           *groups;

  PetscFunctionBegin;
  if (p->levels) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscPartitionerMultistage_CreateStages(comm, NULL, &nstages, &groups));
  PetscCall(PetscPartitionerMultistageSetStages(part, nstages, groups));
  PetscCall(PetscPartitionerMultistage_DestroyStages(nstages, &groups));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* targetSection argument unused, target partition weights are computed internally */
static PetscErrorCode PetscPartitionerPartition_Multistage(PetscPartitioner part, PetscInt nparts, PetscInt numVertices, PetscInt start[], PetscInt adjacency[], PetscSection vertSection, PetscSection edgeSection, PETSC_UNUSED PetscSection targetSection, PetscSection partSection, IS *partition)
{
  PetscPartitioner_MS *p = (PetscPartitioner_MS *)part->data;
  PetscPartitioner     ppart;
  PetscSection         ppartSection = NULL;
  IS                   lpartition;
  const PetscInt      *idxs;
  MPI_Comm             comm, pcomm;
  MPI_Group            group, lgroup, pgroup;
  PetscInt            *pstart, *padjacency;
  PetscInt             nid, i, pedgeCut;
  PetscMPIInt          sameComm, pparts;
  PetscBool            freeadj = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCheck(p->levels, PetscObjectComm((PetscObject)part), PETSC_ERR_ORDER, "Number of stages not set yet");
  PetscCheck(p->stage >= 0 && p->stage < p->levels, PetscObjectComm((PetscObject)part), PETSC_ERR_ARG_OUTOFRANGE, "Invalid stage index %" PetscInt_FMT ", not in [0,%" PetscInt_FMT ")", p->stage, p->levels);
  PetscCall(PetscLogEventBegin(PetscPartitioner_MS_Stage[PetscMin(p->stage, PETSCPARTITIONER_MS_MAXSTAGE)], part, 0, 0, 0));

  /* Group for current stage (size of the group defines number of "local" partitions) */
  lgroup = p->lgroup[p->stage];
  PetscCallMPI(MPI_Group_size(lgroup, &pparts));

  /* Current stage partitioner */
  ppart = p->ppart[p->stage];
  if (ppart) {
    PetscCall(PetscObjectGetComm((PetscObject)ppart, &pcomm));
    PetscCallMPI(MPI_Comm_group(pcomm, &pgroup));
  } else {
    pcomm  = PETSC_COMM_SELF;
    pgroup = MPI_GROUP_EMPTY;
    pparts = -1;
  }

  /* Global comm of partitioner */
  PetscCall(PetscObjectGetComm((PetscObject)part, &comm));
  PetscCallMPI(MPI_Comm_group(comm, &group));

  /* Get adjacency */
  PetscCallMPI(MPI_Group_compare(group, pgroup, &sameComm));
  if (sameComm != MPI_UNEQUAL) {
    pstart     = start;
    padjacency = adjacency;
  } else { /* restrict to partitioner comm */
    ISLocalToGlobalMapping l2g, g2l;
    IS                     gid, rid;
    const PetscInt        *idxs1;
    PetscInt              *idxs2;
    PetscInt               cStart, cEnd, cum;
    DM                     dm = p->stagedm;
    PetscSF                sf;

    PetscCheck(dm, PetscObjectComm((PetscObject)part), PETSC_ERR_PLIB, "Missing stage DM");
    PetscCall(DMGetPointSF(dm, &sf));
    PetscCall(DMPlexGetHeightStratum(dm, part->height, &cStart, &cEnd));
    PetscCall(DMPlexCreateNumbering_Plex(dm, cStart, cEnd, part->height, NULL, sf, &gid));
    /* filter overlapped local cells (if any) */
    PetscCall(ISGetIndices(gid, &idxs1));
    PetscCall(ISGetLocalSize(gid, &cum));
    PetscCall(PetscMalloc1(cum, &idxs2));
    for (i = cStart, cum = 0; i < cEnd; i++) {
      if (idxs1[i - cStart] < 0) continue;
      idxs2[cum++] = idxs1[i - cStart];
    }
    PetscCall(ISRestoreIndices(gid, &idxs1));

    PetscCheck(numVertices == cum, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected %" PetscInt_FMT " != %" PetscInt_FMT, numVertices, cum);
    PetscCall(ISDestroy(&gid));

    /*
       g2l from full numbering to local numbering
       l2g from local numbering to restricted numbering
    */
    PetscCall(ISCreateGeneral(pcomm, numVertices, idxs2, PETSC_OWN_POINTER, &gid));
    PetscCall(ISRenumber(gid, NULL, NULL, &rid));
    PetscCall(ISLocalToGlobalMappingCreateIS(gid, &g2l));
    PetscCall(ISLocalToGlobalMappingSetType(g2l, ISLOCALTOGLOBALMAPPINGHASH));
    PetscCall(ISLocalToGlobalMappingCreateIS(rid, &l2g));
    PetscCall(ISDestroy(&gid));
    PetscCall(ISDestroy(&rid));

    PetscCall(PetscMalloc2(numVertices + 1, &pstart, start[numVertices], &padjacency));
    pstart[0] = 0;
    for (i = 0; i < numVertices; i++) {
      PetscCall(ISGlobalToLocalMappingApply(g2l, IS_GTOLM_DROP, start[i + 1] - start[i], adjacency + start[i], &pstart[i + 1], padjacency + pstart[i]));
      PetscCall(ISLocalToGlobalMappingApply(l2g, pstart[i + 1], padjacency + pstart[i], padjacency + pstart[i]));
      pstart[i + 1] += pstart[i];
    }
    PetscCall(ISLocalToGlobalMappingDestroy(&l2g));
    PetscCall(ISLocalToGlobalMappingDestroy(&g2l));

    freeadj = PETSC_TRUE;
  }

  /* Compute partitioning */
  pedgeCut = 0;
  PetscCall(PetscSectionCreate(pcomm, &ppartSection));
  if (ppart) {
    PetscMPIInt prank;

    PetscCheck(ppart->ops->partition, PetscObjectComm((PetscObject)ppart), PETSC_ERR_ARG_WRONGSTATE, "PetscPartitioner has no partitioning method on stage %" PetscInt_FMT, p->stage);
    PetscCall(PetscPartitionerPartition(ppart, pparts, numVertices, pstart, padjacency, vertSection, edgeSection, p->tpwgs[p->stage], ppartSection, &lpartition));
    PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)ppart), &prank));
    if (!prank) pedgeCut = ppart->edgeCut; /* only the master rank will reduce */
  } else { /* not participating */
    PetscCall(ISCreateStride(PETSC_COMM_SELF, numVertices, 0, 1, &lpartition));
    pparts = numVertices > 0 ? 1 : 0;
  }
  if (freeadj) PetscCall(PetscFree2(pstart, padjacency));

  /* Init final partition (output) */
  PetscCall(PetscSectionReset(partSection));
  PetscCall(PetscSectionSetChart(partSection, 0, nparts));

  /* We need to map the section back to the global comm numbering */
  for (i = 0; i < pparts; i++) {
    PetscInt    dof;
    PetscMPIInt mp, mpt;

    if (lgroup != MPI_GROUP_EMPTY) {
      PetscCall(PetscMPIIntCast(i, &mp));
      PetscCall(PetscSectionGetDof(ppartSection, i, &dof));
      PetscCallMPI(MPI_Group_translate_ranks(lgroup, 1, &mp, group, &mpt));
    } else {
      PetscCheck(pparts == 1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Unexpected pparts %d", pparts);
      PetscCallMPI(MPI_Comm_rank(comm, &mpt));
      PetscCall(ISGetLocalSize(lpartition, &dof));
    }
    PetscCall(PetscSectionSetDof(partSection, mpt, dof));
  }
  PetscCall(PetscSectionSetUp(partSection));
  PetscCall(PetscSectionDestroy(&ppartSection));
  /* No need to translate the "partition" output, since it is in local cell numbering;
     we only change the comm of the index set */
  PetscCall(ISGetIndices(lpartition, &idxs));
  PetscCall(ISGetLocalSize(lpartition, &nid));
  PetscCall(ISCreateGeneral(comm, nid, idxs, PETSC_COPY_VALUES, partition));
  PetscCall(ISRestoreIndices(lpartition, &idxs));
  PetscCall(ISDestroy(&lpartition));

  PetscCall(PetscSectionViewFromOptions(partSection, (PetscObject)part, "-petscpartitioner_multistage_partition_view"));
  PetscCall(ISViewFromOptions(*partition, (PetscObject)part, "-petscpartitioner_multistage_partition_view"));

  /* Update edge-cut */
  p->edgeCut[p->stage] = pedgeCut;
  for (i = p->stage + 1; i < p->levels; i++) p->edgeCut[i] = 0;
  for (i = 0; i < p->stage; i++) pedgeCut += p->edgeCut[i];
  part->edgeCut = -1;
  PetscCallMPI(MPI_Reduce(&pedgeCut, &part->edgeCut, 1, MPIU_INT, MPI_SUM, 0, comm));

  PetscCallMPI(MPI_Group_free(&group));
  if (pgroup != MPI_GROUP_EMPTY) PetscCallMPI(MPI_Group_free(&pgroup));
  PetscCall(PetscLogEventEnd(PetscPartitioner_MS_Stage[PetscMin(p->stage, PETSCPARTITIONER_MS_MAXSTAGE)], part, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PetscPartitionerSetFromOptions_Multistage(PetscPartitioner part, PetscOptionItems PetscOptionsObject)
{
  PetscPartitioner_MS *p        = (PetscPartitioner_MS *)part->data;
  PetscEnum            strategy = (PetscEnum)PETSCPARTITIONER_MS_STRATEGY_NODE;
  PetscBool            set, roundrobin;
  PetscInt             options[3] = {PETSC_DETERMINE, PETSC_DETERMINE, PETSC_DETERMINE};

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "PetscPartitioner Multistage Options");
  PetscCall(PetscOptionsEnum("-petscpartitioner_multistage_strategy", "Default partitioning strategy", "", PetscPartitionerMultistageStrategyList, strategy, &strategy, &set));
  if (set || !p->levels) {
    options[0] = (PetscInt)strategy;
    switch (options[0]) {
    case PETSCPARTITIONER_MS_STRATEGY_NODE:
      options[1] = PETSC_DETERMINE;
      roundrobin = PETSC_FALSE;
      PetscCall(PetscOptionsInt("-petscpartitioner_multistage_node_size", "Number of processes per node", "", options[1], &options[1], NULL));
      PetscCall(PetscOptionsBool("-petscpartitioner_multistage_node_interleaved", "Use round robin rank assignments", "", roundrobin, &roundrobin, NULL));
      options[2] = (PetscInt)roundrobin;
      break;
    case PETSCPARTITIONER_MS_STRATEGY_MSECTION:
      options[1] = 2;
      PetscCall(PetscOptionsInt("-petscpartitioner_multistage_msection", "Number of splits per level", "", options[1], &options[1], NULL));
      break;
    default:
      break; /* LCOV_EXCL_LINE */
    }
  }
  PetscCall(PetscOptionsBool("-petscpartitioner_multistage_tpwgts", "Use target partition weights in stage partitioners", "", p->compute_tpwgs, &p->compute_tpwgs, NULL));
  PetscCall(PetscOptionsBool("-petscpartitioner_multistage_viewtpwgts", "View target partition weights", "", p->view_tpwgs, &p->view_tpwgs, NULL));
  PetscOptionsHeadEnd();

  if (options[0] != PETSC_DETERMINE) {
    MPI_Comm   comm = PetscObjectComm((PetscObject)part);
    PetscInt   nstages;
    MPI_Group *groups;

    PetscCall(PetscPartitionerMultistage_CreateStages(comm, options, &nstages, &groups));
    PetscCall(PetscPartitionerMultistageSetStages(part, nstages, groups));
    PetscCall(PetscPartitionerMultistage_DestroyStages(nstages, &groups));
  }

  {
    PetscInt nstages = p->levels, l;

    for (l = 0; l < nstages; l++) {
      if (p->ppart[l]) PetscCall(PetscPartitionerSetFromOptions(p->ppart[l]));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  PETSCPARTITIONERMULTISTAGE = "multistage" - A PetscPartitioner object using a multistage distribution strategy

  Level: intermediate

  Options Database Keys:
+ -petscpartitioner_multistage_strategy <strategy> - Either `PETSCPARTITIONER_MS_STRATEGY_NODE` or `PETSCPARTITIONER_MS_STRATEGY_MSECTION`
. -petscpartitioner_multistage_node_size <int> - Number of processes per compute node (or `PETSC_DECIDE`)
. -petscpartitioner_multistage_node_interleaved <bool> - Assign ranks to nodes in a round-robin (interleaved) fashion
. -petscpartitioner_multistage_msection <int> - Number of splits per level in recursive m-section splits (use `2` for bisection)
- -petscpartitioner_multistage_tpwgts <bool> - Use target partition weights in the stage partitioners

  Notes:
  The default multistage strategy uses `PETSCPARTITIONER_MS_STRATEGY_NODE` and automatically discovers node information using `MPI_Comm_split_type()`.
  `PETSCPARTITIONER_MS_STRATEGY_MSECTION` is intended mainly for testing purposes.
  Options for the single stage partitioners are prefixed by `-petscpartitioner_multistage_levels_`.
  For example, to use ParMETIS in all stages, use `-petscpartitioner_multistage_levels_petscpartitioner_type parmetis`; see the sketch below.
  Finer grained control is also possible, e.g. `-petscpartitioner_multistage_levels_0_petscpartitioner_type parmetis` and `-petscpartitioner_multistage_levels_1_petscpartitioner_type simple`.
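
  Example Usage:
  A command-line sketch; the executable name `./app` stands in for any `DMPLEX`-based program using this partitioner.
.vb
  mpiexec -n 64 ./app -petscpartitioner_type multistage -petscpartitioner_multistage_levels_petscpartitioner_type parmetis
.ve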

.seealso: `PetscPartitionerType`, `PetscPartitionerCreate()`, `PetscPartitionerSetType()`
M*/

PETSC_EXTERN PetscErrorCode PetscPartitionerCreate_Multistage(PetscPartitioner part)
{
  PetscPartitioner_MS *p;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(part, PETSCPARTITIONER_CLASSID, 1);
  PetscCall(PetscNew(&p));
  p->compute_tpwgs = PETSC_TRUE;

  part->data                = p;
  part->ops->view           = PetscPartitionerView_Multistage;
  part->ops->destroy        = PetscPartitionerDestroy_Multistage;
  part->ops->partition      = PetscPartitionerPartition_Multistage;
  part->ops->setfromoptions = PetscPartitionerSetFromOptions_Multistage;
  part->ops->setup          = PetscPartitionerSetUp_Multistage;

  PetscCall(PetscCitationsRegister(MSPartitionerCitation, &MSPartitionerCite));
  PetscFunctionReturn(PETSC_SUCCESS);
}