xref: /petsc/src/vec/is/sf/impls/window/sfwindow.c (revision bb4b53ef092968f72b740b90dbab8a2b6700db0d)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 
/* Forward declarations for the datatype-cache list and the active-window list */
typedef struct _n_PetscSFDataLink *PetscSFDataLink;
typedef struct _n_PetscSFWinLink  *PetscSFWinLink;
5 
/* Implementation data for a PETSCSFWINDOW star forest */
typedef struct {
  PetscSFWindowSyncType   sync;   /* FENCE, LOCK, or ACTIVE synchronization */
  PetscSFDataLink         link;   /* List of MPI data types, lazily constructed for each data type */
  PetscSFWinLink          wins;   /* List of active windows */
  PetscSFWindowFlavorType flavor; /* Current PETSCSF_WINDOW_FLAVOR_ */
  PetscSF                 dynsf;  /* Helper SF used to broadcast dynamic-window target addresses to the processes that need them */
  MPI_Info                info;   /* Info handle passed to window creation; MPI_INFO_NULL unless set via PetscSFWindowSetInfo() */
  MPI_Comm                window_comm; /* Communicator the windows are created on (may exclude processes with no roots and no leaves) */
  PetscBool               is_empty;    /* This process has no roots and no leaves, so it does not participate in window_comm */
  PetscMPIInt            *wcommranks;  /* Root ranks of this SF translated to their ranks in window_comm */
} PetscSF_Window;
17 
/* Cache entry pairing a unit MPI datatype with the per-rank composite types built from it */
struct _n_PetscSFDataLink {
  MPI_Datatype    unit;   /* Duplicate of the unit type this entry was built for */
  MPI_Datatype   *mine;   /* Per-rank indexed types describing the local leaf buffer */
  MPI_Datatype   *remote; /* Per-rank indexed types describing the remote root buffer */
  PetscSFDataLink next;   /* Next entry in the singly linked cache */
};
24 
/* One entry in the active-window list, with the state needed to reuse, restore, or free the window */
struct _n_PetscSFWinLink {
  PetscBool               inuse;           /* A communication is currently using this window */
  MPI_Aint                bytes;           /* Size of the unit datatype the window was set up for */
  void                   *addr;            /* Local window memory (attached rootdata or allocated base address) */
  void                   *rootdata;        /* Root array this window currently serves */
  void                   *leafdata;        /* Leaf array; only used to help uniquely identify the window */
  MPI_Win                 win;             /* The MPI window itself */
  MPI_Request            *reqs;            /* Per-rank requests, allocated for PETSCSF_WINDOW_SYNC_LOCK */
  PetscSFWindowFlavorType flavor;          /* Flavor the window was created with */
  MPI_Aint               *dyn_target_addr; /* Per-root-rank target displacements (DYNAMIC flavor only) */
  PetscBool               epoch;           /* An access/exposure epoch is currently open on this window */
  PetscBool               persistent;      /* Registered persistent data; kept attached when the window is restored */
  PetscSFWinLink          next;            /* Next entry in the list */
};
39 
/* String tables for the PetscSFWindowSyncType and PetscSFWindowFlavorType enums; the trailing
   entries give the enum type name and prefix, in the layout PetscOptionsEnum() expects */
const char *const PetscSFWindowSyncTypes[]   = {"FENCE", "LOCK", "ACTIVE", "PetscSFWindowSyncType", "PETSCSF_WINDOW_SYNC_", NULL};
const char *const PetscSFWindowFlavorTypes[] = {"CREATE", "DYNAMIC", "ALLOCATE", "SHARED", "PetscSFWindowFlavorType", "PETSCSF_WINDOW_FLAVOR_", NULL};
42 
43 /* Built-in MPI_Ops act elementwise inside MPI_Accumulate, but cannot be used with composite types inside collectives (MPI_Allreduce) */
44 static PetscErrorCode PetscSFWindowOpTranslate(MPI_Op *op)
45 {
46   PetscFunctionBegin;
47   if (*op == MPIU_SUM) *op = MPI_SUM;
48   else if (*op == MPIU_MAX) *op = MPI_MAX;
49   else if (*op == MPIU_MIN) *op = MPI_MIN;
50   PetscFunctionReturn(PETSC_SUCCESS);
51 }
52 
53 /*
54    PetscSFWindowGetDataTypes - gets composite local and remote data types for each rank
55 
56    Not Collective
57 
58    Input Parameters:
59 +  sf - star forest of type `PETSCSFWINDOW`
60 -  unit - data type for each node
61 
62    Output Parameters:
63 +  localtypes - types describing part of local leaf buffer referencing each remote rank
64 -  remotetypes - types describing part of remote root buffer referenced for each remote rank
65 
66    Level: developer
67 
68 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFSetGraph()`, `PetscSFView()`
*/
70 static PetscErrorCode PetscSFWindowGetDataTypes(PetscSF sf, MPI_Datatype unit, const MPI_Datatype **localtypes, const MPI_Datatype **remotetypes)
71 {
72   PetscSF_Window *w = (PetscSF_Window *)sf->data;
73   PetscSFDataLink link;
74   PetscInt        i, nranks;
75   const PetscInt *roffset, *rmine, *rremote;
76 
77   PetscFunctionBegin;
78   /* Look for types in cache */
79   for (link = w->link; link; link = link->next) {
80     PetscBool match;
81     PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
82     if (match) {
83       *localtypes  = link->mine;
84       *remotetypes = link->remote;
85       PetscFunctionReturn(PETSC_SUCCESS);
86     }
87   }
88 
89   /* Create new composite types for each send rank */
90   PetscCall(PetscSFGetRootRanks(sf, &nranks, NULL, &roffset, &rmine, &rremote));
91   PetscCall(PetscNew(&link));
92   PetscCallMPI(MPI_Type_dup(unit, &link->unit));
93   PetscCall(PetscMalloc2(nranks, &link->mine, nranks, &link->remote));
94   for (i = 0; i < nranks; i++) {
95     PetscInt     rcount = roffset[i + 1] - roffset[i];
96     PetscMPIInt *rmine, *rremote;
97 #if !defined(PETSC_USE_64BIT_INDICES)
98     rmine   = sf->rmine + sf->roffset[i];
99     rremote = sf->rremote + sf->roffset[i];
100 #else
101     PetscInt j;
102     PetscCall(PetscMalloc2(rcount, &rmine, rcount, &rremote));
103     for (j = 0; j < rcount; j++) {
104       PetscCall(PetscMPIIntCast(sf->rmine[sf->roffset[i] + j], rmine + j));
105       PetscCall(PetscMPIIntCast(sf->rremote[sf->roffset[i] + j], rremote + j));
106     }
107 #endif
108 
109     PetscCallMPI(MPI_Type_create_indexed_block(rcount, 1, rmine, link->unit, &link->mine[i]));
110     PetscCallMPI(MPI_Type_create_indexed_block(rcount, 1, rremote, link->unit, &link->remote[i]));
111 #if defined(PETSC_USE_64BIT_INDICES)
112     PetscCall(PetscFree2(rmine, rremote));
113 #endif
114     PetscCallMPI(MPI_Type_commit(&link->mine[i]));
115     PetscCallMPI(MPI_Type_commit(&link->remote[i]));
116   }
117   link->next = w->link;
118   w->link    = link;
119 
120   *localtypes  = link->mine;
121   *remotetypes = link->remote;
122   PetscFunctionReturn(PETSC_SUCCESS);
123 }
124 
125 /*@
126   PetscSFWindowSetFlavorType - Set flavor type for `MPI_Win` creation
127 
128   Logically Collective
129 
130   Input Parameters:
131 + sf     - star forest for communication of type `PETSCSFWINDOW`
132 - flavor - flavor type
133 
134   Options Database Key:
135 . -sf_window_flavor <flavor> - sets the flavor type CREATE, DYNAMIC, ALLOCATE or SHARED (see `PetscSFWindowFlavorType`)
136 
137   Level: advanced
138 
139   Notes:
140   Windows reuse follows these rules\:
141 .vb
142      PETSCSF_WINDOW_FLAVOR_CREATE: creates a new window every time, uses MPI_Win_create
143 
144      PETSCSF_WINDOW_FLAVOR_DYNAMIC: uses MPI_Win_create_dynamic/MPI_Win_attach and tries to reuse windows by comparing the root array. Intended to be used on repeated applications of the same SF, e.g.
145        PetscSFRegisterPersistent(sf,rootdata1,leafdata);
146        for i=1 to K
147          PetscSFOperationBegin(sf,rootdata1,leafdata);
148          PetscSFOperationEnd(sf,rootdata1,leafdata);
149          ...
150          PetscSFOperationBegin(sf,rootdata1,leafdata);
151          PetscSFOperationEnd(sf,rootdata1,leafdata);
152        endfor
153        PetscSFDeregisterPersistent(sf,rootdata1,leafdata);
154 
155        The following pattern will instead raise an error
156          PetscSFOperationBegin(sf,rootdata1,leafdata);
157          PetscSFOperationEnd(sf,rootdata1,leafdata);
158          PetscSFOperationBegin(sf,rank ? rootdata1 : rootdata2,leafdata);
159          PetscSFOperationEnd(sf,rank ? rootdata1 : rootdata2,leafdata);
160 
     PETSCSF_WINDOW_FLAVOR_ALLOCATE: uses MPI_Win_allocate, reuses any pre-existing window that fits the data and is not in use

     PETSCSF_WINDOW_FLAVOR_SHARED: uses MPI_Win_allocate_shared, with the same reuse policy as PETSCSF_WINDOW_FLAVOR_ALLOCATE
164 .ve
165 
166 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFSetFromOptions()`, `PetscSFWindowGetFlavorType()`
167 @*/
PetscErrorCode PetscSFWindowSetFlavorType(PetscSF sf, PetscSFWindowFlavorType flavor)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscValidLogicalCollectiveEnum(sf, flavor, 2);
  /* PetscTryMethod: dispatches to the type-specific implementation, silently a no-op for SF types without one */
  PetscTryMethod(sf, "PetscSFWindowSetFlavorType_C", (PetscSF, PetscSFWindowFlavorType), (sf, flavor));
  PetscFunctionReturn(PETSC_SUCCESS);
}
176 
177 static PetscErrorCode PetscSFWindowSetFlavorType_Window(PetscSF sf, PetscSFWindowFlavorType flavor)
178 {
179   PetscSF_Window *w = (PetscSF_Window *)sf->data;
180 
181   PetscFunctionBegin;
182   w->flavor = flavor;
183   PetscFunctionReturn(PETSC_SUCCESS);
184 }
185 
186 /*@
  PetscSFWindowGetFlavorType - Get `PETSCSFWINDOW` flavor type for `PetscSF` communication
188 
189   Logically Collective
190 
191   Input Parameter:
192 . sf - star forest for communication of type `PETSCSFWINDOW`
193 
194   Output Parameter:
195 . flavor - flavor type
196 
197   Level: advanced
198 
199 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFSetFromOptions()`, `PetscSFWindowSetFlavorType()`
200 @*/
PetscErrorCode PetscSFWindowGetFlavorType(PetscSF sf, PetscSFWindowFlavorType *flavor)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscAssertPointer(flavor, 2);
  /* PetscUseMethod: errors if the SF type does not implement this query */
  PetscUseMethod(sf, "PetscSFWindowGetFlavorType_C", (PetscSF, PetscSFWindowFlavorType *), (sf, flavor));
  PetscFunctionReturn(PETSC_SUCCESS);
}
209 
210 static PetscErrorCode PetscSFWindowGetFlavorType_Window(PetscSF sf, PetscSFWindowFlavorType *flavor)
211 {
212   PetscSF_Window *w = (PetscSF_Window *)sf->data;
213 
214   PetscFunctionBegin;
215   *flavor = w->flavor;
216   PetscFunctionReturn(PETSC_SUCCESS);
217 }
218 
219 /*@
220   PetscSFWindowSetSyncType - Set synchronization type for `PetscSF` communication of type  `PETSCSFWINDOW`
221 
222   Logically Collective
223 
224   Input Parameters:
225 + sf   - star forest for communication
226 - sync - synchronization type
227 
228   Options Database Key:
229 . -sf_window_sync <sync> - sets the synchronization type FENCE, LOCK, or ACTIVE (see `PetscSFWindowSyncType`)
230 
231   Level: advanced
232 
233 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFSetFromOptions()`, `PetscSFWindowGetSyncType()`, `PetscSFWindowSyncType`
234 @*/
PetscErrorCode PetscSFWindowSetSyncType(PetscSF sf, PetscSFWindowSyncType sync)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscValidLogicalCollectiveEnum(sf, sync, 2);
  /* PetscTryMethod: dispatches to the type-specific implementation, silently a no-op for SF types without one */
  PetscTryMethod(sf, "PetscSFWindowSetSyncType_C", (PetscSF, PetscSFWindowSyncType), (sf, sync));
  PetscFunctionReturn(PETSC_SUCCESS);
}
243 
244 static PetscErrorCode PetscSFWindowSetSyncType_Window(PetscSF sf, PetscSFWindowSyncType sync)
245 {
246   PetscSF_Window *w = (PetscSF_Window *)sf->data;
247 
248   PetscFunctionBegin;
249   w->sync = sync;
250   PetscFunctionReturn(PETSC_SUCCESS);
251 }
252 
253 /*@
254   PetscSFWindowGetSyncType - Get synchronization type for `PetscSF` communication of type `PETSCSFWINDOW`
255 
256   Logically Collective
257 
258   Input Parameter:
259 . sf - star forest for communication
260 
261   Output Parameter:
262 . sync - synchronization type
263 
264   Level: advanced
265 
266 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFSetFromOptions()`, `PetscSFWindowSetSyncType()`, `PetscSFWindowSyncType`
267 @*/
PetscErrorCode PetscSFWindowGetSyncType(PetscSF sf, PetscSFWindowSyncType *sync)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscAssertPointer(sync, 2);
  /* PetscUseMethod: errors if the SF type does not implement this query */
  PetscUseMethod(sf, "PetscSFWindowGetSyncType_C", (PetscSF, PetscSFWindowSyncType *), (sf, sync));
  PetscFunctionReturn(PETSC_SUCCESS);
}
276 
277 static PetscErrorCode PetscSFWindowGetSyncType_Window(PetscSF sf, PetscSFWindowSyncType *sync)
278 {
279   PetscSF_Window *w = (PetscSF_Window *)sf->data;
280 
281   PetscFunctionBegin;
282   *sync = w->sync;
283   PetscFunctionReturn(PETSC_SUCCESS);
284 }
285 
286 /*@C
287   PetscSFWindowSetInfo - Set the `MPI_Info` handle that will be used for subsequent windows allocation
288 
289   Logically Collective
290 
291   Input Parameters:
292 + sf   - star forest for communication
293 - info - `MPI_Info` handle
294 
295   Level: advanced
296 
297   Note:
298   The info handle is duplicated with a call to `MPI_Info_dup()` unless info = `MPI_INFO_NULL`.
299 
300 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFSetFromOptions()`, `PetscSFWindowGetInfo()`
301 @*/
PetscErrorCode PetscSFWindowSetInfo(PetscSF sf, MPI_Info info)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  /* PetscTryMethod: dispatches to the type-specific implementation, silently a no-op for SF types without one */
  PetscTryMethod(sf, "PetscSFWindowSetInfo_C", (PetscSF, MPI_Info), (sf, info));
  PetscFunctionReturn(PETSC_SUCCESS);
}
309 
310 static PetscErrorCode PetscSFWindowSetInfo_Window(PetscSF sf, MPI_Info info)
311 {
312   PetscSF_Window *w = (PetscSF_Window *)sf->data;
313 
314   PetscFunctionBegin;
315   if (w->info != MPI_INFO_NULL) PetscCallMPI(MPI_Info_free(&w->info));
316   if (info != MPI_INFO_NULL) PetscCallMPI(MPI_Info_dup(info, &w->info));
317   PetscFunctionReturn(PETSC_SUCCESS);
318 }
319 
320 /*@C
321   PetscSFWindowGetInfo - Get the `MPI_Info` handle used for windows allocation
322 
323   Logically Collective
324 
325   Input Parameter:
326 . sf - star forest for communication
327 
328   Output Parameter:
329 . info - `MPI_Info` handle
330 
331   Level: advanced
332 
333   Note:
334   If `PetscSFWindowSetInfo()` has not be called, this returns `MPI_INFO_NULL`
335 
336 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFSetFromOptions()`, `PetscSFWindowSetInfo()`
337 @*/
PetscErrorCode PetscSFWindowGetInfo(PetscSF sf, MPI_Info *info)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscAssertPointer(info, 2);
  /* PetscUseMethod: errors if the SF type does not implement this query */
  PetscUseMethod(sf, "PetscSFWindowGetInfo_C", (PetscSF, MPI_Info *), (sf, info));
  PetscFunctionReturn(PETSC_SUCCESS);
}
346 
347 static PetscErrorCode PetscSFWindowGetInfo_Window(PetscSF sf, MPI_Info *info)
348 {
349   PetscSF_Window *w = (PetscSF_Window *)sf->data;
350 
351   PetscFunctionBegin;
352   *info = w->info;
353   PetscFunctionReturn(PETSC_SUCCESS);
354 }
355 
356 static PetscErrorCode PetscSFWindowCreateDynamicSF(PetscSF sf, PetscSF *dynsf)
357 {
358   PetscSFNode *remotes;
359 
360   PetscFunctionBegin;
361   PetscCall(PetscMalloc1(sf->nranks, &remotes));
362   for (PetscInt i = 0; i < sf->nranks; i++) {
363     remotes[i].rank  = sf->ranks[i];
364     remotes[i].index = 0;
365   }
366   PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_RANKS, dynsf));
367   PetscCall(PetscSFSetType(*dynsf, PETSCSFBASIC)); /* break recursion */
368   PetscCall(PetscSFSetGraph(*dynsf, 1, sf->nranks, NULL, PETSC_OWN_POINTER, remotes, PETSC_OWN_POINTER));
369   PetscFunctionReturn(PETSC_SUCCESS);
370 }
371 
/* Attach rootdata to the dynamic window in link, then use w->dynsf to collect, on each
   process, the attached address of every root rank it references (into dyn_target_addr) */
static PetscErrorCode PetscSFWindowAttach(PetscSF sf, PetscSFWinLink link, void *rootdata, size_t wsize)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
  {
    PetscSF_Window *w = (PetscSF_Window *)sf->data;
    MPI_Comm        wcomm;
    MPI_Aint        winaddr;
    void           *addr = rootdata;
    PetscInt        nranks;
    // some Open MPI versions do not support MPI_Win_attach(win,NULL,0);
    wcomm = w->window_comm;
    if (addr != NULL) PetscCallMPI(MPI_Win_attach(link->win, addr, wsize));
    link->addr = addr;
    PetscCheck(w->dynsf, wcomm, PETSC_ERR_ORDER, "Must call PetscSFSetUp()");
    PetscCall(PetscSFGetRootRanks(w->dynsf, &nranks, NULL, NULL, NULL, NULL));
    PetscCallMPI(MPI_Get_address(addr, &winaddr));
    /* dyn_target_addr holds one target address per referenced root rank; allocated lazily */
    if (!link->dyn_target_addr) PetscCall(PetscMalloc1(nranks, &link->dyn_target_addr));
    /* Broadcast this process's attached address to everyone who targets its roots */
    PetscCall(PetscSFBcastBegin(w->dynsf, MPI_AINT, &winaddr, link->dyn_target_addr, MPI_REPLACE));
    PetscCall(PetscSFBcastEnd(w->dynsf, MPI_AINT, &winaddr, link->dyn_target_addr, MPI_REPLACE));
  }
#else
  SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "dynamic windows not supported");
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
398 
399 /*
400    PetscSFGetWindow - Get a window for use with a given data type
401 
402    Collective
403 
404    Input Parameters:
405 +  sf - star forest
406 .  unit - data type
407 .  rootdata - array to be sent
408 .  leafdata - only used to help uniquely identify windows
409 .  sync - type of synchronization `PetscSFWindowSyncType`
410 .  epoch - `PETSC_TRUE` to acquire the window and start an epoch, `PETSC_FALSE` to just acquire the window
411 .  fenceassert - assert parameter for call to `MPI_Win_fence()`, if sync == `PETSCSF_WINDOW_SYNC_FENCE`
412 .  postassert - assert parameter for call to `MPI_Win_post()`, if sync == `PETSCSF_WINDOW_SYNC_ACTIVE`
413 -  startassert - assert parameter for call to `MPI_Win_start()`, if sync == `PETSCSF_WINDOW_SYNC_ACTIVE`
414 
415    Output Parameters:
+  target_disp - target_disp argument for RMA calls (significant for `PETSCSF_WINDOW_FLAVOR_DYNAMIC` only)
.  reqs - array of requests (significant for sync == `PETSCSF_WINDOW_SYNC_LOCK` only)
-  win - window
419 
420    Level: developer
421 
422 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFGetRootRanks()`, `PetscSFWindowGetDataTypes()`
423 */
424 
static PetscErrorCode PetscSFGetWindow(PetscSF sf, MPI_Datatype unit, void *rootdata, void *leafdata, PetscSFWindowSyncType sync, PetscBool epoch, PetscMPIInt fenceassert, PetscMPIInt postassert, PetscMPIInt startassert, const MPI_Aint **target_disp, MPI_Request **reqs, MPI_Win *win)
{
  PetscSF_Window *w = (PetscSF_Window *)sf->data;
  MPI_Aint        bytes;
  PetscSFWinLink  link;
  PetscBool       reuse = PETSC_FALSE, update = PETSC_FALSE;
  MPI_Aint        wsize;
  MPI_Comm        wcomm;
  PetscBool       is_empty;

  PetscFunctionBegin;
  PetscCall(PetscSFGetDatatypeSize_Internal(PetscObjectComm((PetscObject)sf), unit, &bytes));
  wsize    = (MPI_Aint)(bytes * sf->nroots);
  wcomm    = w->window_comm;
  is_empty = w->is_empty;
  if (is_empty) {
    /* This process has no roots and no leaves, so it is not part of the window communicator */
    if (target_disp) *target_disp = NULL;
    if (reqs) *reqs = NULL;
    *win = MPI_WIN_NULL;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* Every flavor except CREATE may reuse an existing window */
  if (w->flavor != PETSCSF_WINDOW_FLAVOR_CREATE) reuse = PETSC_TRUE;
  if (PetscDefined(HAVE_MPI_FEATURE_DYNAMIC_WINDOW) && w->flavor == PETSCSF_WINDOW_FLAVOR_DYNAMIC) {
    // first search for a persistent window
    for (link = w->wins; reuse && link; link = link->next) {
      PetscBool match;

      if (!link->persistent) continue;
      match = (link->flavor == w->flavor && link->rootdata == rootdata && link->leafdata == leafdata) ? PETSC_TRUE : PETSC_FALSE;
      if (PetscDefined(USE_DEBUG)) {
        PetscInt matches[2];
        PetscInt all_matches[2];

        /* Compute the global max and min of match with one MPI_MAX reduction (min is the max of the negation) */
        matches[0] = match ? 1 : 0;
        matches[1] = match ? -1 : 0;
        PetscCallMPI(MPIU_Allreduce(matches, all_matches, 2, MPIU_INT, MPI_MAX, wcomm));
        all_matches[1] = -all_matches[1];
        PetscCheck(all_matches[0] == all_matches[1], wcomm, PETSC_ERR_ARG_INCOMP,
                   "Inconsistent use across MPI processes of persistent leaf and root data registered with PetscSFRegisterPersistent().\n"
                   "Either the persistent data was changed on a subset of processes (which is not allowed),\n"
                   "or persistent data was not deregistered with PetscSFDeregisterPersistent() before being deallocated");
      }
      if (match) {
        PetscCheck(!link->inuse, wcomm, PETSC_ERR_ARG_WRONGSTATE, "Communication already in progress on persistent root and leaf data");
        PetscCheck(!epoch || !link->epoch, wcomm, PETSC_ERR_ARG_WRONGSTATE, "Communication epoch already open for window");
        PetscCheck(bytes == link->bytes, wcomm, PETSC_ERR_ARG_WRONGSTATE, "Wrong data type for persistent root and leaf data");
        *win = link->win;
        goto found;
      }
    }
  }
  for (link = w->wins; reuse && link; link = link->next) {
    if (w->flavor != link->flavor) continue;
    /* an existing window can be used (1) if it is not in use, (2) if we are
       not asking to start an epoch or it does not have an already started
       epoch, and (3) if it is the right size */
    if (!link->inuse && (!epoch || !link->epoch) && bytes == (MPI_Aint)link->bytes) {
      if (w->flavor == PETSCSF_WINDOW_FLAVOR_DYNAMIC) {
        /* DYNAMIC windows expose rootdata directly by attaching it */
        PetscCall(PetscSFWindowAttach(sf, link, rootdata, wsize));
      } else {
        /* ALLOCATE/SHARED windows own their memory, so rootdata must be copied in (below, at found:) */
        update = PETSC_TRUE;
      }
      link->rootdata = rootdata;
      link->leafdata = leafdata;
      PetscCall(PetscInfo(sf, "Reusing window %" PETSC_INTPTR_T_FMT " of flavor %d for comm %" PETSC_INTPTR_T_FMT "\n", (PETSC_INTPTR_T)link->win, link->flavor, (PETSC_INTPTR_T)wcomm));
      *win = link->win;
      goto found;
    }
  }

  /* No reusable window found: create a new link and window */
  PetscCall(PetscNew(&link));
  link->bytes           = bytes;
  link->next            = w->wins;
  link->flavor          = w->flavor;
  link->dyn_target_addr = NULL;
  link->reqs            = NULL;
  w->wins               = link;
  link->rootdata        = rootdata;
  link->leafdata        = leafdata;
  if (sync == PETSCSF_WINDOW_SYNC_LOCK) {
    PetscInt i;

    PetscCall(PetscMalloc1(sf->nranks, &link->reqs));
    for (i = 0; i < sf->nranks; i++) link->reqs[i] = MPI_REQUEST_NULL;
  }
  switch (w->flavor) {
  case PETSCSF_WINDOW_FLAVOR_CREATE:
    PetscCallMPI(MPI_Win_create(rootdata, wsize, (PetscMPIInt)bytes, w->info, wcomm, &link->win));
    link->addr = rootdata;
    break;
#if defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
  case PETSCSF_WINDOW_FLAVOR_DYNAMIC:
    PetscCallMPI(MPI_Win_create_dynamic(w->info, wcomm, &link->win));
    PetscCall(PetscSFWindowAttach(sf, link, rootdata, wsize));
    break;
  case PETSCSF_WINDOW_FLAVOR_ALLOCATE:
    PetscCallMPI(MPI_Win_allocate(wsize, (PetscMPIInt)bytes, w->info, wcomm, &link->addr, &link->win));
    update = PETSC_TRUE;
    break;
#endif
#if defined(PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY)
  case PETSCSF_WINDOW_FLAVOR_SHARED:
    PetscCallMPI(MPI_Win_allocate_shared(wsize, (PetscMPIInt)bytes, w->info, wcomm, &link->addr, &link->win));
    update = PETSC_TRUE;
    break;
#endif
  default:
    SETERRQ(wcomm, PETSC_ERR_SUP, "No support for flavor %s", PetscSFWindowFlavorTypes[w->flavor]);
  }
  PetscCall(PetscInfo(sf, "New window %" PETSC_INTPTR_T_FMT " of flavor %d for comm %" PETSC_INTPTR_T_FMT "\n", (PETSC_INTPTR_T)link->win, link->flavor, (PETSC_INTPTR_T)wcomm));
  *win = link->win;

found:

  if (target_disp) *target_disp = link->dyn_target_addr;
  if (reqs) *reqs = link->reqs;
  if (update) { /* locks are needed for the "separate" memory model only, the fence guarantees memory-synchronization */
    PetscMPIInt rank;

    PetscCallMPI(MPI_Comm_rank(wcomm, &rank));
    if (sync == PETSCSF_WINDOW_SYNC_LOCK) PetscCallMPI(MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, MPI_MODE_NOCHECK, *win));
    /* Copy rootdata into the window-owned memory */
    PetscCall(PetscMemcpy(link->addr, rootdata, sf->nroots * bytes));
    if (sync == PETSCSF_WINDOW_SYNC_LOCK) {
      PetscCallMPI(MPI_Win_unlock(rank, *win));
      PetscCallMPI(MPI_Win_fence(0, *win));
    }
  }
  link->inuse = PETSC_TRUE;
  link->epoch = epoch;
  if (epoch) {
    switch (sync) {
    case PETSCSF_WINDOW_SYNC_FENCE:
      PetscCallMPI(MPI_Win_fence(fenceassert, *win));
      break;
    case PETSCSF_WINDOW_SYNC_LOCK: /* Handled outside */
      break;
    case PETSCSF_WINDOW_SYNC_ACTIVE: {
      MPI_Group   ingroup, outgroup;
      PetscMPIInt isize, osize;

      /* Open MPI 4.0.2 with btl=vader does not like calling
         - MPI_Win_complete when ogroup is empty
         - MPI_Win_wait when igroup is empty
         So, we do not even issue the corresponding start and post calls
         The MPI standard (Sec. 11.5.2 of MPI 3.1) only requires that
         start(outgroup) has a matching post(ingroup)
         and this is guaranteed by PetscSF
      */
      PetscCall(PetscSFGetGroups(sf, &ingroup, &outgroup));
      PetscCallMPI(MPI_Group_size(ingroup, &isize));
      PetscCallMPI(MPI_Group_size(outgroup, &osize));
      if (isize) PetscCallMPI(MPI_Win_post(ingroup, postassert, *win));
      if (osize) PetscCallMPI(MPI_Win_start(outgroup, startassert, *win));
    } break;
    default:
      SETERRQ(wcomm, PETSC_ERR_PLIB, "Unknown synchronization type");
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
585 
586 /*
587    PetscSFFindWindow - Finds a window that is already in use
588 
589    Not Collective
590 
591    Input Parameters:
592 +  sf - star forest
593 .  unit - data type
594 .  rootdata - array with which the window is associated
595 -  leafdata - only used to help uniquely identify windows
596 
597    Output Parameters:
598 +  win - window
599 -  reqs - outstanding requests associated to the window
600 
601    Level: developer
602 
603 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFGetWindow()`, `PetscSFRestoreWindow()`
604 */
605 static PetscErrorCode PetscSFFindWindow(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata, MPI_Win *win, MPI_Request **reqs)
606 {
607   PetscSF_Window *w = (PetscSF_Window *)sf->data;
608   PetscSFWinLink  link;
609   PetscBool       is_empty;
610   MPI_Aint        bytes;
611 
612   PetscFunctionBegin;
613   PetscCall(PetscSFGetDatatypeSize_Internal(PetscObjectComm((PetscObject)sf), unit, &bytes));
614   *win     = MPI_WIN_NULL;
615   is_empty = w->is_empty;
616   if (is_empty) {
617     *reqs = NULL;
618     *win  = MPI_WIN_NULL;
619     PetscFunctionReturn(PETSC_SUCCESS);
620   }
621   for (link = w->wins; link; link = link->next) {
622     if (rootdata == link->rootdata && leafdata == link->leafdata && bytes == link->bytes) {
623       PetscCall(PetscInfo(sf, "Window %" PETSC_INTPTR_T_FMT " of flavor %d for comm %" PETSC_INTPTR_T_FMT "\n", (PETSC_INTPTR_T)link->win, link->flavor, (PETSC_INTPTR_T)w->window_comm));
624       *win  = link->win;
625       *reqs = link->reqs;
626       PetscFunctionReturn(PETSC_SUCCESS);
627     }
628   }
629   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Requested window not in use");
630 }
631 
632 /*
633    PetscSFRestoreWindow - Restores a window obtained with `PetscSFGetWindow()`
634 
635    Collective
636 
   Input Parameters:
+  sf - star forest
.  unit - data type
.  array - array associated with window
.  sync - type of synchronization `PetscSFWindowSyncType`
.  epoch - close an epoch, must match argument to `PetscSFGetWindow()`
.  fenceassert - assert parameter for call to `MPI_Win_fence()`, if sync == `PETSCSF_WINDOW_SYNC_FENCE`
.  update - if we have to update the local window array
-  win - window
645 
646    Level: developer
647 
648 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFFindWindow()`
649 */
static PetscErrorCode PetscSFRestoreWindow(PetscSF sf, MPI_Datatype unit, void *array, PetscSFWindowSyncType sync, PetscBool epoch, PetscMPIInt fenceassert, PetscBool update, MPI_Win *win)
{
  PetscSF_Window         *w = (PetscSF_Window *)sf->data;
  PetscSFWinLink         *p, link;
  PetscBool               reuse = PETSC_FALSE;
  PetscSFWindowFlavorType flavor;
  void                   *laddr;
  MPI_Aint                bytes;
  MPI_Comm                wcomm;

  PetscFunctionBegin;
  /* Processes excluded from the window communicator hold a null window: nothing to do */
  if (*win == MPI_WIN_NULL) PetscFunctionReturn(PETSC_SUCCESS);
  wcomm = w->window_comm;
  /* Walk the list via a pointer-to-link so a non-reusable entry can be unlinked in place */
  for (p = &w->wins; *p; p = &(*p)->next) {
    link = *p;
    if (*win == link->win) {
      PetscCheck(array == link->rootdata, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Matched window, but not array");
      if (epoch != link->epoch) {
        PetscCheck(!epoch, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "No epoch to end");
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Restoring window without ending epoch");
      }
      laddr  = link->addr;
      flavor = link->flavor;
      bytes  = link->bytes;
      /* Only CREATE-flavor windows are discarded; all others stay cached for reuse */
      if (flavor != PETSCSF_WINDOW_FLAVOR_CREATE) reuse = PETSC_TRUE;
      else {
        *p     = link->next;
        update = PETSC_FALSE;
      } /* remove from list */
      goto found;
    }
  }
  SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Requested window not in use");

found:
  PetscCall(PetscInfo(sf, "Window %" PETSC_INTPTR_T_FMT " of flavor %d for comm %" PETSC_INTPTR_T_FMT "\n", (PETSC_INTPTR_T)link->win, link->flavor, (PETSC_INTPTR_T)wcomm));
  if (epoch) {
    switch (sync) {
    case PETSCSF_WINDOW_SYNC_FENCE:
      PetscCallMPI(MPI_Win_fence(fenceassert, *win));
      break;
    case PETSCSF_WINDOW_SYNC_LOCK: /* Handled outside */
      break;
    case PETSCSF_WINDOW_SYNC_ACTIVE: {
      MPI_Group   ingroup, outgroup;
      PetscMPIInt isize, osize;

      /* Open MPI 4.0.2 with btl=vader does not like calling
         - MPI_Win_complete when ogroup is empty
         - MPI_Win_wait when igroup is empty
         The MPI standard (Sec. 11.5.2 of MPI 3.1) only requires that
         - each process who issues a call to MPI_Win_start issues a call to MPI_Win_Complete
         - each process who issues a call to MPI_Win_post issues a call to MPI_Win_Wait
      */
      PetscCall(PetscSFGetGroups(sf, &ingroup, &outgroup));
      PetscCallMPI(MPI_Group_size(ingroup, &isize));
      PetscCallMPI(MPI_Group_size(outgroup, &osize));
      if (osize) PetscCallMPI(MPI_Win_complete(*win));
      if (isize) PetscCallMPI(MPI_Win_wait(*win));
    } break;
    default:
      SETERRQ(wcomm, PETSC_ERR_PLIB, "Unknown synchronization type");
    }
  }
#if defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
  /* Non-persistent dynamic windows detach the user's root array after each use */
  if (link->flavor == PETSCSF_WINDOW_FLAVOR_DYNAMIC && !link->persistent) {
    if (link->addr != NULL) PetscCallMPI(MPI_Win_detach(link->win, link->addr));
    link->addr = NULL;
  }
#endif
  if (update) {
    /* Copy the (possibly modified) window-owned memory back into the user's array */
    if (sync == PETSCSF_WINDOW_SYNC_LOCK) PetscCallMPI(MPI_Win_fence(MPI_MODE_NOPUT | MPI_MODE_NOSUCCEED, *win));
    PetscCall(PetscMemcpy(array, laddr, sf->nroots * bytes));
  }
  link->epoch = PETSC_FALSE;
  link->inuse = PETSC_FALSE;
  if (!link->persistent) {
    link->rootdata = NULL;
    link->leafdata = NULL;
  }
  if (!reuse) {
    PetscCall(PetscFree(link->dyn_target_addr));
    PetscCall(PetscFree(link->reqs));
    PetscCallMPI(MPI_Win_free(&link->win));
    PetscCall(PetscFree(link));
    *win = MPI_WIN_NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
739 
static PetscErrorCode PetscSFSetUp_Window(PetscSF sf)
{
  PetscSF_Window *w = (PetscSF_Window *)sf->data;
  MPI_Group       ingroup, outgroup;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscSFSetUpRanks(sf, MPI_GROUP_EMPTY));
  PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
  if (w->window_comm == MPI_COMM_NULL) { /* first setup: build the window communicator */
    PetscInt    nroots, nleaves, nranks;
    PetscBool   has_empty;
    PetscMPIInt wcommrank;
    PetscSF     dynsf_full = NULL;

    if (w->flavor == PETSCSF_WINDOW_FLAVOR_DYNAMIC) PetscCall(PetscSFWindowCreateDynamicSF(sf, &dynsf_full));

    PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, NULL, NULL));
    /* has_empty is first this process's own emptiness, then reduced below to "any process empty" */
    has_empty = (nroots == 0 && nleaves == 0) ? PETSC_TRUE : PETSC_FALSE;
    nranks    = sf->nranks;
    PetscCall(PetscMalloc1(nranks, &w->wcommranks));
    w->is_empty = has_empty;
    PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, &has_empty, 1, MPIU_BOOL, MPI_LOR, comm));
    if (has_empty) {
      PetscMPIInt  rank;
      MPI_Comm     raw_comm;
      PetscSFNode *remotes;

      /* Split off the empty processes so windows are created only among participating ranks */
      PetscCallMPI(MPI_Comm_rank(comm, &rank));
      PetscCallMPI(MPI_Comm_split(comm, w->is_empty ? 1 : 0, rank, &raw_comm));
      PetscCall(PetscCommDuplicate(raw_comm, &w->window_comm, NULL));
      PetscCallMPI(MPI_Comm_free(&raw_comm));

      /* Translate each of this SF's root ranks to its rank in the window communicator */
      PetscCallMPI(MPI_Comm_rank(w->window_comm, &wcommrank));
      if (!dynsf_full) PetscCall(PetscSFWindowCreateDynamicSF(sf, &dynsf_full));
      PetscCall(PetscSFBcastBegin(dynsf_full, MPI_INT, &wcommrank, w->wcommranks, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(dynsf_full, MPI_INT, &wcommrank, w->wcommranks, MPI_REPLACE));

      if (w->flavor == PETSCSF_WINDOW_FLAVOR_DYNAMIC) {
        /* Rebuild the address-broadcast SF on the window communicator using the translated ranks */
        PetscCall(PetscSFCreate(w->window_comm, &w->dynsf));
        PetscCall(PetscSFSetType(w->dynsf, PETSCSFBASIC)); /* break recursion */
        PetscCall(PetscMalloc1(sf->nranks, &remotes));
        for (PetscInt i = 0; i < sf->nranks; i++) {
          remotes[i].rank  = w->wcommranks[i];
          remotes[i].index = 0;
        }
        PetscCall(PetscSFSetGraph(w->dynsf, 1, sf->nranks, NULL, PETSC_OWN_POINTER, remotes, PETSC_OWN_POINTER));
      }
    } else {
      /* No empty process anywhere: the window communicator is (a duplicate of) the SF communicator */
      PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)sf), &w->window_comm, NULL));
      PetscCall(PetscArraycpy(w->wcommranks, sf->ranks, nranks));
      PetscCall(PetscObjectReference((PetscObject)dynsf_full));
      w->dynsf = dynsf_full;
    }
    if (w->dynsf) PetscCall(PetscSFSetUp(w->dynsf));
    PetscCall(PetscSFDestroy(&dynsf_full));
  }
  switch (w->sync) {
  case PETSCSF_WINDOW_SYNC_ACTIVE:
    /* Precompute the in/out groups so epoch open/close does not build them lazily */
    PetscCall(PetscSFGetGroups(sf, &ingroup, &outgroup));
  /* fall through */
  default:
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
805 
806 static PetscErrorCode PetscSFSetFromOptions_Window(PetscSF sf, PetscOptionItems *PetscOptionsObject)
807 {
808   PetscSF_Window         *w      = (PetscSF_Window *)sf->data;
809   PetscSFWindowFlavorType flavor = w->flavor;
810 
811   PetscFunctionBegin;
812   PetscOptionsHeadBegin(PetscOptionsObject, "PetscSF Window options");
813   PetscCall(PetscOptionsEnum("-sf_window_sync", "synchronization type to use for PetscSF Window communication", "PetscSFWindowSetSyncType", PetscSFWindowSyncTypes, (PetscEnum)w->sync, (PetscEnum *)&w->sync, NULL));
814   PetscCall(PetscOptionsEnum("-sf_window_flavor", "flavor to use for PetscSF Window creation", "PetscSFWindowSetFlavorType", PetscSFWindowFlavorTypes, (PetscEnum)flavor, (PetscEnum *)&flavor, NULL));
815   PetscCall(PetscSFWindowSetFlavorType(sf, flavor));
816   PetscOptionsHeadEnd();
817   PetscFunctionReturn(PETSC_SUCCESS);
818 }
819 
/* Release all cached communication state: per-datatype MPI type lists, window links,
   the dynamic helper SF, the MPI info object, the window communicator, and the rank table */
static PetscErrorCode PetscSFReset_Window(PetscSF sf)
{
  PetscSF_Window *w = (PetscSF_Window *)sf->data;
  PetscSFDataLink link, next;
  PetscSFWinLink  wlink, wnext;
  PetscInt        i;
  MPI_Comm        wcomm;
  PetscBool       is_empty;

  PetscFunctionBegin;
  /* free the lazily constructed per-unit lists of local/remote derived datatypes */
  for (link = w->link; link; link = next) {
    next = link->next;
    PetscCallMPI(MPI_Type_free(&link->unit));
    for (i = 0; i < sf->nranks; i++) {
      PetscCallMPI(MPI_Type_free(&link->mine[i]));
      PetscCallMPI(MPI_Type_free(&link->remote[i]));
    }
    PetscCall(PetscFree2(link->mine, link->remote));
    PetscCall(PetscFree(link));
  }
  w->link  = NULL;
  wcomm    = w->window_comm;
  is_empty = w->is_empty;
  /* processes with an empty graph were split out of the window communicator and own no windows */
  if (!is_empty) {
    for (wlink = w->wins; wlink; wlink = wnext) {
      wnext = wlink->next;
      PetscCheck(!wlink->inuse, wcomm, PETSC_ERR_ARG_WRONGSTATE, "Window still in use with address %p", (void *)wlink->addr);
      PetscCall(PetscFree(wlink->dyn_target_addr));
      PetscCall(PetscFree(wlink->reqs));
      PetscCallMPI(MPI_Win_free(&wlink->win)); /* collective over the window communicator */
      PetscCall(PetscFree(wlink));
    }
  }
  w->wins = NULL;
  PetscCall(PetscSFDestroy(&w->dynsf));
  if (w->info != MPI_INFO_NULL) PetscCallMPI(MPI_Info_free(&w->info));
  PetscCall(PetscCommDestroy(&w->window_comm));
  PetscCall(PetscFree(w->wcommranks));
  PetscFunctionReturn(PETSC_SUCCESS);
}
860 
/* Pre-register a (rootdata, leafdata) pair for repeated communication with the same unit type.
   Only meaningful for the DYNAMIC flavor: a dynamic window is created once, rootdata is attached
   to it, and the link is marked persistent so later epochs can reuse it without re-attaching */
static PetscErrorCode PetscSFRegisterPersistent_Window(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata)
{
  PetscSF_Window *w = (PetscSF_Window *)sf->data;
  MPI_Aint        bytes, wsize;
  PetscBool       is_empty;
  PetscSFWinLink  link;

  PetscFunctionBegin;
  PetscCall(PetscSFSetUp(sf));
  if (w->flavor != PETSCSF_WINDOW_FLAVOR_DYNAMIC) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscSFGetDatatypeSize_Internal(PetscObjectComm((PetscObject)sf), unit, &bytes));
  wsize    = (MPI_Aint)(bytes * sf->nroots); /* total size of the exposed root buffer in bytes */
  is_empty = w->is_empty;
  /* empty processes are not members of the window communicator, so they cannot take part in
     the collective window creation below */
  if (is_empty) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscNew(&link));
  link->flavor = w->flavor;
  link->next   = w->wins;
#if defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
  {
    MPI_Comm wcomm = w->window_comm;
    PetscCallMPI(MPI_Win_create_dynamic(w->info, wcomm, &link->win));
  }
#endif
  /* attach rootdata to the dynamic window and exchange target addresses */
  PetscCall(PetscSFWindowAttach(sf, link, (void *)rootdata, wsize));
  link->rootdata   = (void *)rootdata;
  link->leafdata   = (void *)leafdata; /* leafdata is recorded only as a lookup key, never dereferenced here */
  link->bytes      = bytes;
  link->epoch      = PETSC_FALSE;
  link->inuse      = PETSC_FALSE;
  link->persistent = PETSC_TRUE; /* survives PetscSFRestoreWindow until explicitly deregistered */
  w->wins          = link;
  if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) {
    PetscInt i;

    /* LOCK sync uses per-root-rank request-based gets; preallocate the request array */
    PetscCall(PetscMalloc1(sf->nranks, &link->reqs));
    for (i = 0; i < sf->nranks; i++) link->reqs[i] = MPI_REQUEST_NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
900 
/* Undo PetscSFRegisterPersistent_Window(): find the persistent link matching
   (unit size, rootdata, leafdata), detach the buffer, free the window, and unlink it */
static PetscErrorCode PetscSFDeregisterPersistent_Window(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata)
{
  PetscSF_Window *w = (PetscSF_Window *)sf->data;
  MPI_Aint        bytes;
  MPI_Comm        wcomm;
  PetscBool       is_empty;
  PetscSFWinLink *p;

  PetscFunctionBegin;
  PetscCall(PetscSFSetUp(sf));
  if (w->flavor != PETSCSF_WINDOW_FLAVOR_DYNAMIC) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscSFGetDatatypeSize_Internal(PetscObjectComm((PetscObject)sf), unit, &bytes));
  wcomm    = w->window_comm;
  is_empty = w->is_empty;
  /* empty processes never registered anything (see PetscSFRegisterPersistent_Window) */
  if (is_empty) PetscFunctionReturn(PETSC_SUCCESS);
  /* walk the list via a pointer-to-pointer so the match can be unlinked in place */
  for (p = &w->wins; *p; p = &(*p)->next) {
    PetscSFWinLink link = *p;
    if (link->flavor == w->flavor && link->persistent && link->rootdata == rootdata && link->leafdata == leafdata && link->bytes == bytes) {
      PetscCheck(!link->inuse, wcomm, PETSC_ERR_ARG_WRONGSTATE, "Deregistering a window when communication is still in progress");
      PetscCheck(!link->epoch, wcomm, PETSC_ERR_ARG_WRONGSTATE, "Deregistering a window with an unconcluded epoch");
#if defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
      PetscCallMPI(MPI_Win_detach(link->win, link->addr));
      link->addr = NULL;
#endif
      PetscCall(PetscFree(link->dyn_target_addr));
      PetscCall(PetscFree(link->reqs));
      PetscCallMPI(MPI_Win_free(&link->win)); /* collective over the window communicator */
      *p = link->next;
      PetscCall(PetscFree(link));
      break;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
935 
936 static PetscErrorCode PetscSFDestroy_Window(PetscSF sf)
937 {
938   PetscFunctionBegin;
939   PetscCall(PetscSFReset_Window(sf));
940   PetscCall(PetscFree(sf->data));
941   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetSyncType_C", NULL));
942   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetSyncType_C", NULL));
943   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetFlavorType_C", NULL));
944   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetFlavorType_C", NULL));
945   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetInfo_C", NULL));
946   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetInfo_C", NULL));
947   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFRegisterPersistent_C", NULL));
948   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFDeregisterPersistent_C", NULL));
949   PetscFunctionReturn(PETSC_SUCCESS);
950 }
951 
952 static PetscErrorCode PetscSFView_Window(PetscSF sf, PetscViewer viewer)
953 {
954   PetscSF_Window   *w = (PetscSF_Window *)sf->data;
955   PetscBool         iascii;
956   PetscViewerFormat format;
957 
958   PetscFunctionBegin;
959   PetscCall(PetscViewerGetFormat(viewer, &format));
960   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
961   if (iascii) {
962     PetscCall(PetscViewerASCIIPrintf(viewer, "  current flavor=%s synchronization=%s MultiSF sort=%s\n", PetscSFWindowFlavorTypes[w->flavor], PetscSFWindowSyncTypes[w->sync], sf->rankorder ? "rank-order" : "unordered"));
963     if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
964       if (w->info != MPI_INFO_NULL) {
965         PetscMPIInt k, nkeys;
966         char        key[MPI_MAX_INFO_KEY], value[MPI_MAX_INFO_VAL];
967 
968         PetscCallMPI(MPI_Info_get_nkeys(w->info, &nkeys));
969         PetscCall(PetscViewerASCIIPrintf(viewer, "    current info with %d keys. Ordered key-value pairs follow:\n", nkeys));
970         for (k = 0; k < nkeys; k++) {
971           PetscMPIInt flag;
972 
973           PetscCallMPI(MPI_Info_get_nthkey(w->info, k, key));
974           PetscCallMPI(MPI_Info_get(w->info, key, MPI_MAX_INFO_VAL, value, &flag));
975           PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing key %s", key);
976           PetscCall(PetscViewerASCIIPrintf(viewer, "      %s = %s\n", key, value));
977         }
978       } else {
979         PetscCall(PetscViewerASCIIPrintf(viewer, "    current info=MPI_INFO_NULL\n"));
980       }
981     }
982   }
983   PetscFunctionReturn(PETSC_SUCCESS);
984 }
985 
986 static PetscErrorCode PetscSFDuplicate_Window(PetscSF sf, PetscSFDuplicateOption opt, PetscSF newsf)
987 {
988   PetscSF_Window       *w = (PetscSF_Window *)sf->data;
989   PetscSFWindowSyncType synctype;
990 
991   PetscFunctionBegin;
992   synctype = w->sync;
993   /* HACK: Must use FENCE or LOCK when called from PetscSFGetGroups() because ACTIVE here would cause recursion. */
994   if (!sf->setupcalled) synctype = PETSCSF_WINDOW_SYNC_LOCK;
995   PetscCall(PetscSFWindowSetSyncType(newsf, synctype));
996   PetscCall(PetscSFWindowSetFlavorType(newsf, w->flavor));
997   PetscCall(PetscSFWindowSetInfo(newsf, w->info));
998   PetscFunctionReturn(PETSC_SUCCESS);
999 }
1000 
/* Begin broadcast of rootdata into leafdata: open a window exposing rootdata and
   MPI_Get() from each root rank using the cached derived datatypes */
static PetscErrorCode PetscSFBcastBegin_Window(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, void *leafdata, MPI_Op op)
{
  PetscSF_Window     *w = (PetscSF_Window *)sf->data;
  PetscInt            i, nranks;
  const PetscMPIInt  *ranks;
  const MPI_Aint     *target_disp;
  const MPI_Datatype *mine, *remote;
  MPI_Request        *reqs;
  MPI_Win             win;

  PetscFunctionBegin;
  /* a broadcast only copies root values; combining ops would need a different epoch design */
  PetscCheck(op == MPI_REPLACE, PetscObjectComm((PetscObject)sf), PETSC_ERR_SUP, "PetscSFBcastBegin_Window with op!=MPI_REPLACE has not been implemented");
  PetscCall(PetscSFGetRootRanks(sf, &nranks, NULL, NULL, NULL, NULL));
  PetscCall(PetscSFWindowGetDataTypes(sf, unit, &mine, &remote));
  /* open the access epoch; NOPUT/NOPRECEDE assertions reflect that this epoch only reads roots */
  PetscCall(PetscSFGetWindow(sf, unit, (void *)rootdata, leafdata, w->sync, PETSC_TRUE, MPI_MODE_NOPUT | MPI_MODE_NOPRECEDE, MPI_MODE_NOPUT, 0, &target_disp, &reqs, &win));
  ranks = w->wcommranks; /* root ranks translated to the window communicator */
  for (i = 0; i < nranks; i++) {
    MPI_Aint tdp = target_disp ? target_disp[i] : 0; /* dynamic flavor supplies per-rank displacements */

    if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) {
      /* passive target: lock each rank; requests (if MPI_Rget available) are completed in BcastEnd */
      PetscCallMPI(MPI_Win_lock(MPI_LOCK_SHARED, ranks[i], MPI_MODE_NOCHECK, win));
#if defined(PETSC_HAVE_MPI_RGET)
      PetscCallMPI(MPI_Rget(leafdata, 1, mine[i], ranks[i], tdp, 1, remote[i], win, &reqs[i]));
#else
      PetscCallMPI(MPI_Get(leafdata, 1, mine[i], ranks[i], tdp, 1, remote[i], win));
#endif
    } else {
      /* FENCE/ACTIVE sync: plain get, completed when the epoch closes in BcastEnd */
      PetscCallMPI(MPI_Get(leafdata, 1, mine[i], ranks[i], tdp, 1, remote[i], win));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1033 
/* Finish a broadcast: complete outstanding requests, drop per-rank locks (LOCK sync),
   and close the window epoch opened in PetscSFBcastBegin_Window() */
static PetscErrorCode PetscSFBcastEnd_Window(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata, MPI_Op op)
{
  PetscSF_Window *w = (PetscSF_Window *)sf->data;
  MPI_Win         win;
  MPI_Request    *reqs = NULL;

  PetscFunctionBegin;
  /* locate the in-flight window keyed by (unit, rootdata, leafdata) */
  PetscCall(PetscSFFindWindow(sf, unit, rootdata, leafdata, &win, &reqs));
  /* reqs is non-NULL only on the MPI_Rget path of the LOCK sync */
  if (reqs) PetscCallMPI(MPI_Waitall(sf->nranks, reqs, MPI_STATUSES_IGNORE));
  if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) {
    PetscInt           i, nranks;
    const PetscMPIInt *ranks;

    /* unlock in window-communicator rank order, matching the locks taken in BcastBegin */
    PetscCall(PetscSFGetRootRanks(sf, &nranks, NULL, NULL, NULL, NULL));
    ranks = w->wcommranks;
    for (i = 0; i < nranks; i++) PetscCallMPI(MPI_Win_unlock(ranks[i], win));
  }
  PetscCall(PetscSFRestoreWindow(sf, unit, (void *)rootdata, w->sync, PETSC_TRUE, MPI_MODE_NOSTORE | MPI_MODE_NOSUCCEED, PETSC_FALSE, &win));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1054 
/* Begin reduction of leafdata into rootdata: open a window over rootdata and
   MPI_Accumulate() leaf contributions onto each root rank with the (translated) op */
static PetscErrorCode PetscSFReduceBegin_Window(PetscSF sf, MPI_Datatype unit, PetscMemType leafmtype, const void *leafdata, PetscMemType rootmtype, void *rootdata, MPI_Op op)
{
  PetscSF_Window     *w = (PetscSF_Window *)sf->data;
  PetscInt            i, nranks;
  const PetscMPIInt  *ranks;
  const MPI_Aint     *target_disp;
  const MPI_Datatype *mine, *remote;
  MPI_Win             win;

  PetscFunctionBegin;
  PetscCall(PetscSFGetRootRanks(sf, &nranks, NULL, NULL, NULL, NULL));
  PetscCall(PetscSFWindowGetDataTypes(sf, unit, &mine, &remote));
  /* map PETSc's MPIU_* reduction ops to built-in MPI_Ops usable with MPI_Accumulate */
  PetscCall(PetscSFWindowOpTranslate(&op));
  PetscCall(PetscSFGetWindow(sf, unit, rootdata, (void *)leafdata, w->sync, PETSC_TRUE, MPI_MODE_NOPRECEDE, 0, 0, &target_disp, NULL, &win));
  ranks = w->wcommranks; /* root ranks translated to the window communicator */
  for (i = 0; i < nranks; i++) {
    MPI_Aint tdp = target_disp ? target_disp[i] : 0; /* dynamic flavor supplies per-rank displacements */

    /* LOCK sync wraps each accumulate in a shared lock; MPI_Accumulate is elementwise-atomic,
       so shared locks from multiple origins are sufficient */
    if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) PetscCallMPI(MPI_Win_lock(MPI_LOCK_SHARED, ranks[i], MPI_MODE_NOCHECK, win));
    PetscCallMPI(MPI_Accumulate((void *)leafdata, 1, mine[i], ranks[i], tdp, 1, remote[i], op, win));
    if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) PetscCallMPI(MPI_Win_unlock(ranks[i], win));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1079 
1080 static PetscErrorCode PetscSFReduceEnd_Window(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *rootdata, MPI_Op op)
1081 {
1082   PetscSF_Window *w = (PetscSF_Window *)sf->data;
1083   MPI_Win         win;
1084   MPI_Request    *reqs = NULL;
1085 
1086   PetscFunctionBegin;
1087   PetscCall(PetscSFFindWindow(sf, unit, rootdata, leafdata, &win, &reqs));
1088   if (reqs) PetscCallMPI(MPI_Waitall(sf->nranks, reqs, MPI_STATUSES_IGNORE));
1089   PetscCall(PetscSFRestoreWindow(sf, unit, rootdata, w->sync, PETSC_TRUE, MPI_MODE_NOSUCCEED, PETSC_TRUE, &win));
1090   PetscFunctionReturn(PETSC_SUCCESS);
1091 }
1092 
/* Begin fetch-and-op: atomically read the current root values into leafupdate and
   combine leafdata into the roots. Uses MPI_Get_accumulate when available; otherwise
   emulates it with exclusive locks around a Get + Accumulate pair */
static PetscErrorCode PetscSFFetchAndOpBegin_Window(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, void *rootdata, PetscMemType leafmtype, const void *leafdata, void *leafupdate, MPI_Op op)
{
  PetscInt            i, nranks;
  const PetscMPIInt  *ranks;
  const MPI_Datatype *mine, *remote;
  const MPI_Aint     *target_disp;
  MPI_Win             win;
  PetscSF_Window     *w = (PetscSF_Window *)sf->data;
#if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  PetscSFWindowFlavorType oldf;
#endif

  PetscFunctionBegin;
  PetscCall(PetscSFGetRootRanks(sf, &nranks, NULL, NULL, NULL, NULL));
  PetscCall(PetscSFWindowGetDataTypes(sf, unit, &mine, &remote));
  /* map PETSc's MPIU_* reduction ops to built-in MPI_Ops usable in one-sided accumulates */
  PetscCall(PetscSFWindowOpTranslate(&op));
#if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  /* FetchAndOp without MPI_Get_Accumulate requires locking.
     we create a new window every time to not interfere with user-defined MPI_Info which may have used "no_locks"="true" */
  oldf      = w->flavor;
  w->flavor = PETSCSF_WINDOW_FLAVOR_CREATE; /* temporarily force CREATE; restored below */
  PetscCall(PetscSFGetWindow(sf, unit, rootdata, (void *)leafdata, PETSCSF_WINDOW_SYNC_LOCK, PETSC_FALSE, 0, 0, 0, &target_disp, NULL, &win));
#else
  PetscCall(PetscSFGetWindow(sf, unit, rootdata, (void *)leafdata, w->sync, PETSC_TRUE, MPI_MODE_NOPRECEDE, 0, 0, &target_disp, NULL, &win));
#endif
  ranks = w->wcommranks; /* root ranks translated to the window communicator */
  for (i = 0; i < nranks; i++) {
    MPI_Aint tdp = target_disp ? target_disp[i] : 0; /* dynamic flavor supplies per-rank displacements */

#if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
    /* exclusive lock makes the read-modify-write pair atomic with respect to other origins */
    PetscCallMPI(MPI_Win_lock(MPI_LOCK_EXCLUSIVE, ranks[i], 0, win));
    PetscCallMPI(MPI_Get(leafupdate, 1, mine[i], ranks[i], tdp, 1, remote[i], win));
    PetscCallMPI(MPI_Accumulate((void *)leafdata, 1, mine[i], ranks[i], tdp, 1, remote[i], op, win));
    PetscCallMPI(MPI_Win_unlock(ranks[i], win));
#else
    /* MPI_Get_accumulate performs the fetch and the combine atomically; a shared lock suffices */
    if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) PetscCallMPI(MPI_Win_lock(MPI_LOCK_SHARED, ranks[i], 0, win));
    PetscCallMPI(MPI_Get_accumulate((void *)leafdata, 1, mine[i], leafupdate, 1, mine[i], ranks[i], tdp, 1, remote[i], op, win));
    if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) PetscCallMPI(MPI_Win_unlock(ranks[i], win));
#endif
  }
#if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  w->flavor = oldf; /* restore the user-selected flavor */
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
1138 
/* Finish fetch-and-op: complete pending requests and close the epoch with arguments
   matching the sync type used in PetscSFFetchAndOpBegin_Window() (forced LOCK sync on
   the emulated, no-MPI_Get_accumulate path) */
static PetscErrorCode PetscSFFetchAndOpEnd_Window(PetscSF sf, MPI_Datatype unit, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
{
  MPI_Win win;
#if defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  PetscSF_Window *w = (PetscSF_Window *)sf->data; /* only needed to read w->sync on this path */
#endif
  MPI_Request *reqs = NULL;

  PetscFunctionBegin;
  /* locate the in-flight window keyed by (unit, rootdata, leafdata) */
  PetscCall(PetscSFFindWindow(sf, unit, rootdata, leafdata, &win, &reqs));
  if (reqs) PetscCallMPI(MPI_Waitall(sf->nranks, reqs, MPI_STATUSES_IGNORE));
#if defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  PetscCall(PetscSFRestoreWindow(sf, unit, rootdata, w->sync, PETSC_TRUE, MPI_MODE_NOSUCCEED, PETSC_TRUE, &win));
#else
  PetscCall(PetscSFRestoreWindow(sf, unit, rootdata, PETSCSF_WINDOW_SYNC_LOCK, PETSC_FALSE, 0, PETSC_TRUE, &win));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
1157 
1158 PETSC_INTERN PetscErrorCode PetscSFCreate_Window(PetscSF sf)
1159 {
1160   PetscSF_Window *w = (PetscSF_Window *)sf->data;
1161 
1162   PetscFunctionBegin;
1163   sf->ops->SetUp           = PetscSFSetUp_Window;
1164   sf->ops->SetFromOptions  = PetscSFSetFromOptions_Window;
1165   sf->ops->Reset           = PetscSFReset_Window;
1166   sf->ops->Destroy         = PetscSFDestroy_Window;
1167   sf->ops->View            = PetscSFView_Window;
1168   sf->ops->Duplicate       = PetscSFDuplicate_Window;
1169   sf->ops->BcastBegin      = PetscSFBcastBegin_Window;
1170   sf->ops->BcastEnd        = PetscSFBcastEnd_Window;
1171   sf->ops->ReduceBegin     = PetscSFReduceBegin_Window;
1172   sf->ops->ReduceEnd       = PetscSFReduceEnd_Window;
1173   sf->ops->FetchAndOpBegin = PetscSFFetchAndOpBegin_Window;
1174   sf->ops->FetchAndOpEnd   = PetscSFFetchAndOpEnd_Window;
1175 
1176   PetscCall(PetscNew(&w));
1177   sf->data       = (void *)w;
1178   w->sync        = PETSCSF_WINDOW_SYNC_FENCE;
1179   w->flavor      = PETSCSF_WINDOW_FLAVOR_CREATE;
1180   w->info        = MPI_INFO_NULL;
1181   w->window_comm = MPI_COMM_NULL;
1182 
1183   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetSyncType_C", PetscSFWindowSetSyncType_Window));
1184   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetSyncType_C", PetscSFWindowGetSyncType_Window));
1185   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetFlavorType_C", PetscSFWindowSetFlavorType_Window));
1186   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetFlavorType_C", PetscSFWindowGetFlavorType_Window));
1187   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetInfo_C", PetscSFWindowSetInfo_Window));
1188   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetInfo_C", PetscSFWindowGetInfo_Window));
1189   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFRegisterPersistent_C", PetscSFRegisterPersistent_Window));
1190   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFDeregisterPersistent_C", PetscSFDeregisterPersistent_Window));
1191 
1192 #if defined(PETSC_HAVE_OPENMPI)
1193   #if PETSC_PKG_OPENMPI_VERSION_LE(1, 6, 0)
1194   {
1195     PetscBool ackbug = PETSC_FALSE;
1196     PetscCall(PetscOptionsGetBool(NULL, NULL, "-acknowledge_ompi_onesided_bug", &ackbug, NULL));
1197     if (ackbug) {
1198       PetscCall(PetscInfo(sf, "Acknowledged Open MPI bug, proceeding anyway. Expect memory corruption.\n"));
1199     } else SETERRQ(PetscObjectComm((PetscObject)sf), PETSC_ERR_LIB, "Open MPI is known to be buggy (https://svn.open-mpi.org/trac/ompi/ticket/1905 and 2656), use -acknowledge_ompi_onesided_bug to proceed");
1200   }
1201   #endif
1202 #endif
1203   PetscFunctionReturn(PETSC_SUCCESS);
1204 }
1205