xref: /petsc/src/vec/is/sf/impls/window/sfwindow.c (revision ccfb0f9f40a0131988d7995ed9679700dae2a75a)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 
3 typedef struct _n_PetscSFDataLink *PetscSFDataLink;
4 typedef struct _n_PetscSFWinLink  *PetscSFWinLink;
5 
/* Private state for a PetscSF of type PETSCSFWINDOW (MPI one-sided communication) */
typedef struct {
  PetscSFWindowSyncType   sync;   /* FENCE, LOCK, or ACTIVE synchronization */
  PetscSFDataLink         link;   /* List of MPI data types, lazily constructed for each data type */
  PetscSFWinLink          wins;   /* List of active windows */
  PetscSFWindowFlavorType flavor; /* Current PETSCSF_WINDOW_FLAVOR_ */
  PetscSF                 dynsf;       /* helper SF used to broadcast dynamic-window base addresses to leaf ranks */
  MPI_Info                info;        /* info handle passed to window creation; MPI_INFO_NULL unless set via PetscSFWindowSetInfo() */
  MPI_Comm                window_comm; /* communicator over which windows are created (split off from the SF comm when some processes are empty) */
  PetscBool               is_empty;    /* PETSC_TRUE when this process has zero roots and zero leaves */
  PetscMPIInt            *wcommranks;  /* root ranks of this SF translated to their ranks in window_comm */
} PetscSF_Window;
17 
/* Cache node holding the composite MPI datatypes derived from one unit type
   (built lazily by PetscSFWindowGetDataTypes()) */
struct _n_PetscSFDataLink {
  MPI_Datatype    unit;   /* duplicate of the user's unit datatype; acts as the cache key */
  MPI_Datatype   *mine;   /* per-root-rank indexed type addressing this rank's leaf buffer entries */
  MPI_Datatype   *remote; /* per-root-rank indexed type addressing the remote root buffer entries */
  PetscSFDataLink next;   /* singly linked cache list */
};
24 
/* One MPI window plus its bookkeeping; windows live in a singly linked list so they can be reused */
struct _n_PetscSFWinLink {
  PetscBool               inuse;           /* communication currently in progress on this window */
  MPI_Aint                bytes;           /* size of the unit datatype this window was created for */
  void                   *addr;            /* local window memory (user array, attached, or MPI-allocated) */
  void                   *rootdata;        /* root array this window currently serves */
  void                   *leafdata;        /* leaf array; only used to identify the window uniquely */
  MPI_Win                 win;             /* the MPI window handle */
  MPI_Request            *reqs;            /* per-root-rank requests, allocated for SYNC_LOCK only */
  PetscSFWindowFlavorType flavor;          /* flavor used to create this window */
  MPI_Aint               *dyn_target_addr; /* per-root-rank target base addresses (FLAVOR_DYNAMIC only) */
  PetscBool               epoch;           /* an access/exposure epoch is currently open */
  PetscBool               persistent;      /* registered via PetscSFRegisterPersistent(); data stays attached across restores */
  PetscSFWinLink          next;            /* next window in the list */
};
39 
/* String tables matching the PetscSFWindowSyncType / PetscSFWindowFlavorType enums,
   used for options processing and diagnostic output (trailing entries follow the
   PetscEnum convention: type name, prefix, NULL sentinel) */
const char *const PetscSFWindowSyncTypes[]   = {"FENCE", "LOCK", "ACTIVE", "PetscSFWindowSyncType", "PETSCSF_WINDOW_SYNC_", NULL};
const char *const PetscSFWindowFlavorTypes[] = {"CREATE", "DYNAMIC", "ALLOCATE", "SHARED", "PetscSFWindowFlavorType", "PETSCSF_WINDOW_FLAVOR_", NULL};
42 
43 /* Built-in MPI_Ops act elementwise inside MPI_Accumulate, but cannot be used with composite types inside collectives (MPI_Allreduce) */
44 static PetscErrorCode PetscSFWindowOpTranslate(MPI_Op *op)
45 {
46   PetscFunctionBegin;
47   if (*op == MPIU_SUM) *op = MPI_SUM;
48   else if (*op == MPIU_MAX) *op = MPI_MAX;
49   else if (*op == MPIU_MIN) *op = MPI_MIN;
50   PetscFunctionReturn(PETSC_SUCCESS);
51 }
52 
53 /*
54    PetscSFWindowGetDataTypes - gets composite local and remote data types for each rank
55 
56    Not Collective
57 
58    Input Parameters:
59 +  sf - star forest of type `PETSCSFWINDOW`
60 -  unit - data type for each node
61 
62    Output Parameters:
63 +  localtypes - types describing part of local leaf buffer referencing each remote rank
64 -  remotetypes - types describing part of remote root buffer referenced for each remote rank
65 
66    Level: developer
67 
68 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFSetGraph()`, `PetscSFView()`
*/
static PetscErrorCode PetscSFWindowGetDataTypes(PetscSF sf, MPI_Datatype unit, const MPI_Datatype **localtypes, const MPI_Datatype **remotetypes)
{
  PetscSF_Window *w = (PetscSF_Window *)sf->data;
  PetscSFDataLink link;
  PetscMPIInt     nranks;
  const PetscInt *roffset;

  PetscFunctionBegin;
  /* Look for types in cache */
  for (link = w->link; link; link = link->next) {
    PetscBool match;

    PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
    if (match) {
      /* cache hit: hand out the previously committed types */
      *localtypes  = link->mine;
      *remotetypes = link->remote;
      PetscFunctionReturn(PETSC_SUCCESS);
    }
  }

  /* Create new composite types for each send rank */
  PetscCall(PetscSFGetRootRanks(sf, &nranks, NULL, &roffset, NULL, NULL));
  PetscCall(PetscNew(&link));
  PetscCallMPI(MPI_Type_dup(unit, &link->unit));
  PetscCall(PetscMalloc2(nranks, &link->mine, nranks, &link->remote));
  for (PetscMPIInt i = 0; i < nranks; i++) {
    PetscMPIInt  rcount;
    PetscMPIInt *rmine, *rremote;

    PetscCall(PetscMPIIntCast(roffset[i + 1] - roffset[i], &rcount));
#if !defined(PETSC_USE_64BIT_INDICES)
    /* PetscInt == int here, so the SF index arrays can be aliased directly as PetscMPIInt */
    rmine   = sf->rmine + sf->roffset[i];
    rremote = sf->rremote + sf->roffset[i];
#else
    /* 64-bit PetscInt: narrow the indices into temporary PetscMPIInt arrays with overflow checks */
    PetscCall(PetscMalloc2(rcount, &rmine, rcount, &rremote));
    for (PetscInt j = 0; j < rcount; j++) {
      PetscCall(PetscMPIIntCast(sf->rmine[sf->roffset[i] + j], &rmine[j]));
      PetscCall(PetscMPIIntCast(sf->rremote[sf->roffset[i] + j], &rremote[j]));
    }
#endif

    /* one indexed block type per rank: local selects leaf entries, remote selects root entries */
    PetscCallMPI(MPI_Type_create_indexed_block(rcount, 1, rmine, link->unit, &link->mine[i]));
    PetscCallMPI(MPI_Type_create_indexed_block(rcount, 1, rremote, link->unit, &link->remote[i]));
#if defined(PETSC_USE_64BIT_INDICES)
    PetscCall(PetscFree2(rmine, rremote));
#endif
    PetscCallMPI(MPI_Type_commit(&link->mine[i]));
    PetscCallMPI(MPI_Type_commit(&link->remote[i]));
  }
  /* prepend the new entry to the cache */
  link->next = w->link;
  w->link    = link;

  *localtypes  = link->mine;
  *remotetypes = link->remote;
  PetscFunctionReturn(PETSC_SUCCESS);
}
126 
127 /*@
128   PetscSFWindowSetFlavorType - Set flavor type for `MPI_Win` creation
129 
130   Logically Collective
131 
132   Input Parameters:
133 + sf     - star forest for communication of type `PETSCSFWINDOW`
134 - flavor - flavor type
135 
136   Options Database Key:
137 . -sf_window_flavor <flavor> - sets the flavor type CREATE, DYNAMIC, ALLOCATE or SHARED (see `PetscSFWindowFlavorType`)
138 
139   Level: advanced
140 
141   Notes:
142   Windows reuse follows these rules\:
143 .vb
144      PETSCSF_WINDOW_FLAVOR_CREATE: creates a new window every time, uses MPI_Win_create
145 
146      PETSCSF_WINDOW_FLAVOR_DYNAMIC: uses MPI_Win_create_dynamic/MPI_Win_attach and tries to reuse windows by comparing the root array. Intended to be used on repeated applications of the same SF, e.g.
147        PetscSFRegisterPersistent(sf,rootdata1,leafdata);
148        for i=1 to K
149          PetscSFOperationBegin(sf,rootdata1,leafdata);
150          PetscSFOperationEnd(sf,rootdata1,leafdata);
151          ...
152          PetscSFOperationBegin(sf,rootdata1,leafdata);
153          PetscSFOperationEnd(sf,rootdata1,leafdata);
154        endfor
155        PetscSFDeregisterPersistent(sf,rootdata1,leafdata);
156 
157        The following pattern will instead raise an error
158          PetscSFOperationBegin(sf,rootdata1,leafdata);
159          PetscSFOperationEnd(sf,rootdata1,leafdata);
160          PetscSFOperationBegin(sf,rank ? rootdata1 : rootdata2,leafdata);
161          PetscSFOperationEnd(sf,rank ? rootdata1 : rootdata2,leafdata);
162 
163      PETSCSF_WINDOW_FLAVOR_ALLOCATE: uses MPI_Win_allocate, reuses any pre-existing window which fits the data and it is not in use
164 
     PETSCSF_WINDOW_FLAVOR_SHARED: uses MPI_Win_allocate_shared, reuse policy is the same as for PETSCSF_WINDOW_FLAVOR_ALLOCATE
166 .ve
167 
168 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFSetFromOptions()`, `PetscSFWindowGetFlavorType()`
169 @*/
PetscErrorCode PetscSFWindowSetFlavorType(PetscSF sf, PetscSFWindowFlavorType flavor)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscValidLogicalCollectiveEnum(sf, flavor, 2);
  /* PetscTryMethod: dispatch to the window implementation; silently a no-op for other SF types */
  PetscTryMethod(sf, "PetscSFWindowSetFlavorType_C", (PetscSF, PetscSFWindowFlavorType), (sf, flavor));
  PetscFunctionReturn(PETSC_SUCCESS);
}
178 
179 static PetscErrorCode PetscSFWindowSetFlavorType_Window(PetscSF sf, PetscSFWindowFlavorType flavor)
180 {
181   PetscSF_Window *w = (PetscSF_Window *)sf->data;
182 
183   PetscFunctionBegin;
184   w->flavor = flavor;
185   PetscFunctionReturn(PETSC_SUCCESS);
186 }
187 
188 /*@
189   PetscSFWindowGetFlavorType - Get  `PETSCSFWINDOW` flavor type for `PetscSF` communication
190 
191   Logically Collective
192 
193   Input Parameter:
194 . sf - star forest for communication of type `PETSCSFWINDOW`
195 
196   Output Parameter:
197 . flavor - flavor type
198 
199   Level: advanced
200 
201 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFSetFromOptions()`, `PetscSFWindowSetFlavorType()`
202 @*/
PetscErrorCode PetscSFWindowGetFlavorType(PetscSF sf, PetscSFWindowFlavorType *flavor)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscAssertPointer(flavor, 2);
  /* PetscUseMethod: errors if the SF type does not provide this method (i.e. not PETSCSFWINDOW) */
  PetscUseMethod(sf, "PetscSFWindowGetFlavorType_C", (PetscSF, PetscSFWindowFlavorType *), (sf, flavor));
  PetscFunctionReturn(PETSC_SUCCESS);
}
211 
212 static PetscErrorCode PetscSFWindowGetFlavorType_Window(PetscSF sf, PetscSFWindowFlavorType *flavor)
213 {
214   PetscSF_Window *w = (PetscSF_Window *)sf->data;
215 
216   PetscFunctionBegin;
217   *flavor = w->flavor;
218   PetscFunctionReturn(PETSC_SUCCESS);
219 }
220 
221 /*@
222   PetscSFWindowSetSyncType - Set synchronization type for `PetscSF` communication of type  `PETSCSFWINDOW`
223 
224   Logically Collective
225 
226   Input Parameters:
227 + sf   - star forest for communication
228 - sync - synchronization type
229 
230   Options Database Key:
231 . -sf_window_sync <sync> - sets the synchronization type FENCE, LOCK, or ACTIVE (see `PetscSFWindowSyncType`)
232 
233   Level: advanced
234 
235 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFSetFromOptions()`, `PetscSFWindowGetSyncType()`, `PetscSFWindowSyncType`
236 @*/
PetscErrorCode PetscSFWindowSetSyncType(PetscSF sf, PetscSFWindowSyncType sync)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscValidLogicalCollectiveEnum(sf, sync, 2);
  /* PetscTryMethod: dispatch to the window implementation; silently a no-op for other SF types */
  PetscTryMethod(sf, "PetscSFWindowSetSyncType_C", (PetscSF, PetscSFWindowSyncType), (sf, sync));
  PetscFunctionReturn(PETSC_SUCCESS);
}
245 
246 static PetscErrorCode PetscSFWindowSetSyncType_Window(PetscSF sf, PetscSFWindowSyncType sync)
247 {
248   PetscSF_Window *w = (PetscSF_Window *)sf->data;
249 
250   PetscFunctionBegin;
251   w->sync = sync;
252   PetscFunctionReturn(PETSC_SUCCESS);
253 }
254 
255 /*@
256   PetscSFWindowGetSyncType - Get synchronization type for `PetscSF` communication of type `PETSCSFWINDOW`
257 
258   Logically Collective
259 
260   Input Parameter:
261 . sf - star forest for communication
262 
263   Output Parameter:
264 . sync - synchronization type
265 
266   Level: advanced
267 
268 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFSetFromOptions()`, `PetscSFWindowSetSyncType()`, `PetscSFWindowSyncType`
269 @*/
PetscErrorCode PetscSFWindowGetSyncType(PetscSF sf, PetscSFWindowSyncType *sync)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscAssertPointer(sync, 2);
  /* PetscUseMethod: errors if the SF type does not provide this method (i.e. not PETSCSFWINDOW) */
  PetscUseMethod(sf, "PetscSFWindowGetSyncType_C", (PetscSF, PetscSFWindowSyncType *), (sf, sync));
  PetscFunctionReturn(PETSC_SUCCESS);
}
278 
279 static PetscErrorCode PetscSFWindowGetSyncType_Window(PetscSF sf, PetscSFWindowSyncType *sync)
280 {
281   PetscSF_Window *w = (PetscSF_Window *)sf->data;
282 
283   PetscFunctionBegin;
284   *sync = w->sync;
285   PetscFunctionReturn(PETSC_SUCCESS);
286 }
287 
288 /*@C
289   PetscSFWindowSetInfo - Set the `MPI_Info` handle that will be used for subsequent windows allocation
290 
291   Logically Collective
292 
293   Input Parameters:
294 + sf   - star forest for communication
295 - info - `MPI_Info` handle
296 
297   Level: advanced
298 
299   Note:
300   The info handle is duplicated with a call to `MPI_Info_dup()` unless info = `MPI_INFO_NULL`.
301 
302 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFSetFromOptions()`, `PetscSFWindowGetInfo()`
303 @*/
PetscErrorCode PetscSFWindowSetInfo(PetscSF sf, MPI_Info info)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  /* PetscTryMethod: dispatch to the window implementation; silently a no-op for other SF types */
  PetscTryMethod(sf, "PetscSFWindowSetInfo_C", (PetscSF, MPI_Info), (sf, info));
  PetscFunctionReturn(PETSC_SUCCESS);
}
311 
312 static PetscErrorCode PetscSFWindowSetInfo_Window(PetscSF sf, MPI_Info info)
313 {
314   PetscSF_Window *w = (PetscSF_Window *)sf->data;
315 
316   PetscFunctionBegin;
317   if (w->info != MPI_INFO_NULL) PetscCallMPI(MPI_Info_free(&w->info));
318   if (info != MPI_INFO_NULL) PetscCallMPI(MPI_Info_dup(info, &w->info));
319   PetscFunctionReturn(PETSC_SUCCESS);
320 }
321 
322 /*@C
323   PetscSFWindowGetInfo - Get the `MPI_Info` handle used for windows allocation
324 
325   Logically Collective
326 
327   Input Parameter:
328 . sf - star forest for communication
329 
330   Output Parameter:
331 . info - `MPI_Info` handle
332 
333   Level: advanced
334 
335   Note:
336   If `PetscSFWindowSetInfo()` has not be called, this returns `MPI_INFO_NULL`
337 
338 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFSetFromOptions()`, `PetscSFWindowSetInfo()`
339 @*/
PetscErrorCode PetscSFWindowGetInfo(PetscSF sf, MPI_Info *info)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscAssertPointer(info, 2);
  /* PetscUseMethod: errors if the SF type does not provide this method (i.e. not PETSCSFWINDOW) */
  PetscUseMethod(sf, "PetscSFWindowGetInfo_C", (PetscSF, MPI_Info *), (sf, info));
  PetscFunctionReturn(PETSC_SUCCESS);
}
348 
349 static PetscErrorCode PetscSFWindowGetInfo_Window(PetscSF sf, MPI_Info *info)
350 {
351   PetscSF_Window *w = (PetscSF_Window *)sf->data;
352 
353   PetscFunctionBegin;
354   *info = w->info;
355   PetscFunctionReturn(PETSC_SUCCESS);
356 }
357 
/* Build a helper SF with one root per process and one leaf per root rank of sf,
   used to broadcast a single per-process value (e.g. a dynamic window base address)
   from every root rank to the processes that reference it */
static PetscErrorCode PetscSFWindowCreateDynamicSF(PetscSF sf, PetscSF *dynsf)
{
  PetscSFNode *remotes;

  PetscFunctionBegin;
  PetscCall(PetscMalloc1(sf->nranks, &remotes));
  for (PetscInt i = 0; i < sf->nranks; i++) {
    remotes[i].rank  = sf->ranks[i];
    remotes[i].index = 0; /* each process exposes exactly one root, at index 0 */
  }
  PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_RANKS, dynsf));
  PetscCall(PetscSFSetType(*dynsf, PETSCSFBASIC)); /* break recursion */
  /* PETSC_OWN_POINTER: the SF takes ownership of remotes; do not free it here */
  PetscCall(PetscSFSetGraph(*dynsf, 1, sf->nranks, NULL, PETSC_OWN_POINTER, remotes, PETSC_OWN_POINTER));
  PetscFunctionReturn(PETSC_SUCCESS);
}
373 
/* Attach rootdata to a dynamic-flavor window and broadcast each root rank's base
   address through w->dynsf so leaves can compute target displacements for RMA calls */
static PetscErrorCode PetscSFWindowAttach(PetscSF sf, PetscSFWinLink link, void *rootdata, size_t wsize)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
  {
    PetscSF_Window *w = (PetscSF_Window *)sf->data;
    MPI_Comm        wcomm;
    MPI_Aint        winaddr;
    void           *addr = rootdata;
    PetscMPIInt     nranks;
    // some Open MPI versions do not support MPI_Win_attach(win,NULL,0);
    wcomm = w->window_comm;
    if (addr != NULL) PetscCallMPI(MPI_Win_attach(link->win, addr, wsize));
    link->addr = addr;
    PetscCheck(w->dynsf, wcomm, PETSC_ERR_ORDER, "Must call PetscSFSetUp()");
    PetscCall(PetscSFGetRootRanks(w->dynsf, &nranks, NULL, NULL, NULL, NULL));
    /* each process exposes its local base address; dynsf delivers each root's address to its leaves */
    PetscCallMPI(MPI_Get_address(addr, &winaddr));
    if (!link->dyn_target_addr) PetscCall(PetscMalloc1(nranks, &link->dyn_target_addr));
    PetscCall(PetscSFBcastBegin(w->dynsf, MPI_AINT, &winaddr, link->dyn_target_addr, MPI_REPLACE));
    PetscCall(PetscSFBcastEnd(w->dynsf, MPI_AINT, &winaddr, link->dyn_target_addr, MPI_REPLACE));
  }
#else
  SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "dynamic windows not supported");
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
400 
401 /*
402    PetscSFGetWindow - Get a window for use with a given data type
403 
404    Collective
405 
406    Input Parameters:
407 +  sf - star forest
408 .  unit - data type
409 .  rootdata - array to be sent
410 .  leafdata - only used to help uniquely identify windows
411 .  sync - type of synchronization `PetscSFWindowSyncType`
412 .  epoch - `PETSC_TRUE` to acquire the window and start an epoch, `PETSC_FALSE` to just acquire the window
413 .  fenceassert - assert parameter for call to `MPI_Win_fence()`, if sync == `PETSCSF_WINDOW_SYNC_FENCE`
414 .  postassert - assert parameter for call to `MPI_Win_post()`, if sync == `PETSCSF_WINDOW_SYNC_ACTIVE`
415 -  startassert - assert parameter for call to `MPI_Win_start()`, if sync == `PETSCSF_WINDOW_SYNC_ACTIVE`
416 
417    Output Parameters:
+  target_disp - target_disp argument for RMA calls (significant for `PETSCSF_WINDOW_FLAVOR_DYNAMIC` only)
.  reqs - array of requests (significant for sync == `PETSCSF_WINDOW_SYNC_LOCK` only)
420 -  win - window
421 
422    Level: developer
423 
424 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFGetRootRanks()`, `PetscSFWindowGetDataTypes()`
425 */
426 
static PetscErrorCode PetscSFGetWindow(PetscSF sf, MPI_Datatype unit, void *rootdata, void *leafdata, PetscSFWindowSyncType sync, PetscBool epoch, PetscMPIInt fenceassert, PetscMPIInt postassert, PetscMPIInt startassert, const MPI_Aint **target_disp, MPI_Request **reqs, MPI_Win *win)
{
  PetscSF_Window *w = (PetscSF_Window *)sf->data;
  MPI_Aint        bytes;
  PetscSFWinLink  link;
  PetscBool       reuse = PETSC_FALSE, update = PETSC_FALSE;
  MPI_Aint        wsize;
  MPI_Comm        wcomm;
  PetscBool       is_empty;

  PetscFunctionBegin;
  PetscCall(PetscSFGetDatatypeSize_Internal(PetscObjectComm((PetscObject)sf), unit, &bytes));
  wsize    = (MPI_Aint)(bytes * sf->nroots);
  wcomm    = w->window_comm;
  is_empty = w->is_empty;
  if (is_empty) {
    /* processes with no roots and no leaves get no window at all */
    if (target_disp) *target_disp = NULL;
    if (reqs) *reqs = NULL;
    *win = MPI_WIN_NULL;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* every flavor except CREATE tries to reuse a previously created window */
  if (w->flavor != PETSCSF_WINDOW_FLAVOR_CREATE) reuse = PETSC_TRUE;
  if (PetscDefined(HAVE_MPI_FEATURE_DYNAMIC_WINDOW) && w->flavor == PETSCSF_WINDOW_FLAVOR_DYNAMIC) {
    // first search for a persistent window
    for (link = w->wins; reuse && link; link = link->next) {
      PetscBool match;

      if (!link->persistent) continue;
      match = (link->flavor == w->flavor && link->rootdata == rootdata && link->leafdata == leafdata) ? PETSC_TRUE : PETSC_FALSE;
      if (PetscDefined(USE_DEBUG)) {
        PetscInt matches[2];
        PetscInt all_matches[2];

        /* encode (any rank matched, all ranks matched) in a single MAX reduction:
           max of (match, -match) gives 1 if anyone matched and -(min match) in slot 1 */
        matches[0] = match ? 1 : 0;
        matches[1] = match ? -1 : 0;
        PetscCallMPI(MPIU_Allreduce(matches, all_matches, 2, MPIU_INT, MPI_MAX, wcomm));
        all_matches[1] = -all_matches[1];
        PetscCheck(all_matches[0] == all_matches[1], wcomm, PETSC_ERR_ARG_INCOMP,
                   "Inconsistent use across MPI processes of persistent leaf and root data registered with PetscSFRegisterPersistent().\n"
                   "Either the persistent data was changed on a subset of processes (which is not allowed),\n"
                   "or persistent data was not deregistered with PetscSFDeregisterPersistent() before being deallocated");
      }
      if (match) {
        PetscCheck(!link->inuse, wcomm, PETSC_ERR_ARG_WRONGSTATE, "Communication already in progress on persistent root and leaf data");
        PetscCheck(!epoch || !link->epoch, wcomm, PETSC_ERR_ARG_WRONGSTATE, "Communication epoch already open for window");
        PetscCheck(bytes == link->bytes, wcomm, PETSC_ERR_ARG_WRONGSTATE, "Wrong data type for persistent root and leaf data");
        *win = link->win;
        goto found;
      }
    }
  }
  for (link = w->wins; reuse && link; link = link->next) {
    if (w->flavor != link->flavor) continue;
    /* an existing window can be used (1) if it is not in use, (2) if we are
       not asking to start an epoch or it does not have an already started
       epoch, and (3) if it is the right size */
    if (!link->inuse && (!epoch || !link->epoch) && bytes == (MPI_Aint)link->bytes) {
      if (w->flavor == PETSCSF_WINDOW_FLAVOR_DYNAMIC) {
        PetscCall(PetscSFWindowAttach(sf, link, rootdata, wsize));
      } else {
        update = PETSC_TRUE; /* rootdata must be copied into the window memory below */
      }
      link->rootdata = rootdata;
      link->leafdata = leafdata;
      PetscCall(PetscInfo(sf, "Reusing window %" PETSC_INTPTR_T_FMT " of flavor %d for comm %" PETSC_INTPTR_T_FMT "\n", (PETSC_INTPTR_T)link->win, link->flavor, (PETSC_INTPTR_T)wcomm));
      *win = link->win;
      goto found;
    }
  }

  /* no reusable window: create a new link and a window of the requested flavor */
  PetscCall(PetscNew(&link));
  link->bytes           = bytes;
  link->next            = w->wins;
  link->flavor          = w->flavor;
  link->dyn_target_addr = NULL;
  link->reqs            = NULL;
  w->wins               = link;
  link->rootdata        = rootdata;
  link->leafdata        = leafdata;
  if (sync == PETSCSF_WINDOW_SYNC_LOCK) {
    PetscCall(PetscMalloc1(sf->nranks, &link->reqs));
    for (PetscMPIInt i = 0; i < sf->nranks; i++) link->reqs[i] = MPI_REQUEST_NULL;
  }
  switch (w->flavor) {
  case PETSCSF_WINDOW_FLAVOR_CREATE:
    /* window exposes the user's rootdata directly */
    PetscCallMPI(MPI_Win_create(rootdata, wsize, (PetscMPIInt)bytes, w->info, wcomm, &link->win));
    link->addr = rootdata;
    break;
#if defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
  case PETSCSF_WINDOW_FLAVOR_DYNAMIC:
    PetscCallMPI(MPI_Win_create_dynamic(w->info, wcomm, &link->win));
    PetscCall(PetscSFWindowAttach(sf, link, rootdata, wsize));
    break;
  case PETSCSF_WINDOW_FLAVOR_ALLOCATE:
    /* MPI owns the memory; rootdata must be copied in (update) and out on restore */
    PetscCallMPI(MPI_Win_allocate(wsize, (PetscMPIInt)bytes, w->info, wcomm, &link->addr, &link->win));
    update = PETSC_TRUE;
    break;
#endif
#if defined(PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY)
  case PETSCSF_WINDOW_FLAVOR_SHARED:
    PetscCallMPI(MPI_Win_allocate_shared(wsize, (PetscMPIInt)bytes, w->info, wcomm, &link->addr, &link->win));
    update = PETSC_TRUE;
    break;
#endif
  default:
    SETERRQ(wcomm, PETSC_ERR_SUP, "No support for flavor %s", PetscSFWindowFlavorTypes[w->flavor]);
  }
  PetscCall(PetscInfo(sf, "New window %" PETSC_INTPTR_T_FMT " of flavor %d for comm %" PETSC_INTPTR_T_FMT "\n", (PETSC_INTPTR_T)link->win, link->flavor, (PETSC_INTPTR_T)wcomm));
  *win = link->win;

found:

  if (target_disp) *target_disp = link->dyn_target_addr;
  if (reqs) *reqs = link->reqs;
  if (update) { /* locks are needed for the "separate" memory model only, the fence guarantees memory-synchronization */
    PetscMPIInt rank;

    PetscCallMPI(MPI_Comm_rank(wcomm, &rank));
    if (sync == PETSCSF_WINDOW_SYNC_LOCK) PetscCallMPI(MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, MPI_MODE_NOCHECK, *win));
    PetscCall(PetscMemcpy(link->addr, rootdata, sf->nroots * bytes));
    if (sync == PETSCSF_WINDOW_SYNC_LOCK) {
      PetscCallMPI(MPI_Win_unlock(rank, *win));
      PetscCallMPI(MPI_Win_fence(0, *win));
    }
  }
  link->inuse = PETSC_TRUE;
  link->epoch = epoch;
  if (epoch) {
    switch (sync) {
    case PETSCSF_WINDOW_SYNC_FENCE:
      PetscCallMPI(MPI_Win_fence(fenceassert, *win));
      break;
    case PETSCSF_WINDOW_SYNC_LOCK: /* Handled outside */
      break;
    case PETSCSF_WINDOW_SYNC_ACTIVE: {
      MPI_Group   ingroup, outgroup;
      PetscMPIInt isize, osize;

      /* Open MPI 4.0.2 with btl=vader does not like calling
         - MPI_Win_complete when ogroup is empty
         - MPI_Win_wait when igroup is empty
         So, we do not even issue the corresponding start and post calls
         The MPI standard (Sec. 11.5.2 of MPI 3.1) only requires that
         start(outgroup) has a matching post(ingroup)
         and this is guaranteed by PetscSF
      */
      PetscCall(PetscSFGetGroups(sf, &ingroup, &outgroup));
      PetscCallMPI(MPI_Group_size(ingroup, &isize));
      PetscCallMPI(MPI_Group_size(outgroup, &osize));
      if (isize) PetscCallMPI(MPI_Win_post(ingroup, postassert, *win));
      if (osize) PetscCallMPI(MPI_Win_start(outgroup, startassert, *win));
    } break;
    default:
      SETERRQ(wcomm, PETSC_ERR_PLIB, "Unknown synchronization type");
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
585 
586 /*
587    PetscSFFindWindow - Finds a window that is already in use
588 
589    Not Collective
590 
591    Input Parameters:
592 +  sf - star forest
593 .  unit - data type
594 .  rootdata - array with which the window is associated
595 -  leafdata - only used to help uniquely identify windows
596 
597    Output Parameters:
598 +  win - window
599 -  reqs - outstanding requests associated to the window
600 
601    Level: developer
602 
603 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFGetWindow()`, `PetscSFRestoreWindow()`
604 */
605 static PetscErrorCode PetscSFFindWindow(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata, MPI_Win *win, MPI_Request **reqs)
606 {
607   PetscSF_Window *w = (PetscSF_Window *)sf->data;
608   PetscSFWinLink  link;
609   PetscBool       is_empty;
610   MPI_Aint        bytes;
611 
612   PetscFunctionBegin;
613   PetscCall(PetscSFGetDatatypeSize_Internal(PetscObjectComm((PetscObject)sf), unit, &bytes));
614   *win     = MPI_WIN_NULL;
615   is_empty = w->is_empty;
616   if (is_empty) {
617     *reqs = NULL;
618     *win  = MPI_WIN_NULL;
619     PetscFunctionReturn(PETSC_SUCCESS);
620   }
621   for (link = w->wins; link; link = link->next) {
622     if (rootdata == link->rootdata && leafdata == link->leafdata && bytes == link->bytes) {
623       PetscCall(PetscInfo(sf, "Window %" PETSC_INTPTR_T_FMT " of flavor %d for comm %" PETSC_INTPTR_T_FMT "\n", (PETSC_INTPTR_T)link->win, link->flavor, (PETSC_INTPTR_T)w->window_comm));
624       *win  = link->win;
625       *reqs = link->reqs;
626       PetscFunctionReturn(PETSC_SUCCESS);
627     }
628   }
629   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Requested window not in use");
630 }
631 
632 /*
633    PetscSFRestoreWindow - Restores a window obtained with `PetscSFGetWindow()`
634 
635    Collective
636 
637    Input Parameters:
638 +  sf - star forest
639 .  unit - data type
640 .  array - array associated with window
641 .  sync - type of synchronization `PetscSFWindowSyncType`
642 .  epoch - close an epoch, must match argument to `PetscSFGetWindow()`
643 .  update - if we have to update the local window array
644 -  win - window
645 
646    Level: developer
647 
648 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFFindWindow()`
649 */
static PetscErrorCode PetscSFRestoreWindow(PetscSF sf, MPI_Datatype unit, void *array, PetscSFWindowSyncType sync, PetscBool epoch, PetscMPIInt fenceassert, PetscBool update, MPI_Win *win)
{
  PetscSF_Window         *w = (PetscSF_Window *)sf->data;
  PetscSFWinLink         *p, link;
  PetscBool               reuse = PETSC_FALSE;
  PetscSFWindowFlavorType flavor;
  void                   *laddr;
  MPI_Aint                bytes;
  MPI_Comm                wcomm;

  PetscFunctionBegin;
  if (*win == MPI_WIN_NULL) PetscFunctionReturn(PETSC_SUCCESS); /* empty processes have no window */
  wcomm = w->window_comm;
  /* walk via the 'next' pointers so a non-reusable link can be unlinked in place */
  for (p = &w->wins; *p; p = &(*p)->next) {
    link = *p;
    if (*win == link->win) {
      PetscCheck(array == link->rootdata, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Matched window, but not array");
      if (epoch != link->epoch) {
        PetscCheck(!epoch, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "No epoch to end");
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Restoring window without ending epoch");
      }
      laddr  = link->addr;
      flavor = link->flavor;
      bytes  = link->bytes;
      if (flavor != PETSCSF_WINDOW_FLAVOR_CREATE) reuse = PETSC_TRUE;
      else {
        *p     = link->next;
        update = PETSC_FALSE; /* CREATE windows expose the user's array directly, nothing to copy back */
      } /* remove from list */
      goto found;
    }
  }
  SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Requested window not in use");

found:
  PetscCall(PetscInfo(sf, "Window %" PETSC_INTPTR_T_FMT " of flavor %d for comm %" PETSC_INTPTR_T_FMT "\n", (PETSC_INTPTR_T)link->win, link->flavor, (PETSC_INTPTR_T)wcomm));
  if (epoch) {
    switch (sync) {
    case PETSCSF_WINDOW_SYNC_FENCE:
      PetscCallMPI(MPI_Win_fence(fenceassert, *win));
      break;
    case PETSCSF_WINDOW_SYNC_LOCK: /* Handled outside */
      break;
    case PETSCSF_WINDOW_SYNC_ACTIVE: {
      MPI_Group   ingroup, outgroup;
      PetscMPIInt isize, osize;

      /* Open MPI 4.0.2 with btl=vader does not like calling
         - MPI_Win_complete when ogroup is empty
         - MPI_Win_wait when igroup is empty
         The MPI standard (Sec. 11.5.2 of MPI 3.1) only requires that
         - each process who issues a call to MPI_Win_start issues a call to MPI_Win_Complete
         - each process who issues a call to MPI_Win_post issues a call to MPI_Win_Wait
      */
      PetscCall(PetscSFGetGroups(sf, &ingroup, &outgroup));
      PetscCallMPI(MPI_Group_size(ingroup, &isize));
      PetscCallMPI(MPI_Group_size(outgroup, &osize));
      if (osize) PetscCallMPI(MPI_Win_complete(*win));
      if (isize) PetscCallMPI(MPI_Win_wait(*win));
    } break;
    default:
      SETERRQ(wcomm, PETSC_ERR_PLIB, "Unknown synchronization type");
    }
  }
#if defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
  /* non-persistent dynamic windows detach the user's memory on every restore */
  if (link->flavor == PETSCSF_WINDOW_FLAVOR_DYNAMIC && !link->persistent) {
    if (link->addr != NULL) PetscCallMPI(MPI_Win_detach(link->win, link->addr));
    link->addr = NULL;
  }
#endif
  if (update) {
    /* copy the (MPI-allocated) window memory back into the user's root array */
    if (sync == PETSCSF_WINDOW_SYNC_LOCK) PetscCallMPI(MPI_Win_fence(MPI_MODE_NOPUT | MPI_MODE_NOSUCCEED, *win));
    PetscCall(PetscMemcpy(array, laddr, sf->nroots * bytes));
  }
  link->epoch = PETSC_FALSE;
  link->inuse = PETSC_FALSE;
  if (!link->persistent) {
    link->rootdata = NULL;
    link->leafdata = NULL;
  }
  if (!reuse) {
    /* CREATE-flavor windows are destroyed rather than cached */
    PetscCall(PetscFree(link->dyn_target_addr));
    PetscCall(PetscFree(link->reqs));
    PetscCallMPI(MPI_Win_free(&link->win));
    PetscCall(PetscFree(link));
    *win = MPI_WIN_NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
739 
740 static PetscErrorCode PetscSFSetUp_Window(PetscSF sf)
741 {
742   PetscSF_Window *w = (PetscSF_Window *)sf->data;
743   MPI_Group       ingroup, outgroup;
744   MPI_Comm        comm;
745 
746   PetscFunctionBegin;
747   PetscCall(PetscSFSetUpRanks(sf, MPI_GROUP_EMPTY));
748   PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
749   if (w->window_comm == MPI_COMM_NULL) {
750     PetscInt    nroots, nleaves, nranks;
751     PetscBool   has_empty;
752     PetscMPIInt wcommrank;
753     PetscSF     dynsf_full = NULL;
754 
755     if (w->flavor == PETSCSF_WINDOW_FLAVOR_DYNAMIC) PetscCall(PetscSFWindowCreateDynamicSF(sf, &dynsf_full));
756 
757     PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, NULL, NULL));
758     has_empty = (nroots == 0 && nleaves == 0) ? PETSC_TRUE : PETSC_FALSE;
759     nranks    = sf->nranks;
760     PetscCall(PetscMalloc1(nranks, &w->wcommranks));
761     w->is_empty = has_empty;
762     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &has_empty, 1, MPI_C_BOOL, MPI_LOR, comm));
763     if (has_empty) {
764       PetscMPIInt  rank;
765       MPI_Comm     raw_comm;
766       PetscSFNode *remotes;
767 
768       PetscCallMPI(MPI_Comm_rank(comm, &rank));
769       PetscCallMPI(MPI_Comm_split(comm, w->is_empty ? 1 : 0, rank, &raw_comm));
770       PetscCall(PetscCommDuplicate(raw_comm, &w->window_comm, NULL));
771       PetscCallMPI(MPI_Comm_free(&raw_comm));
772 
773       PetscCallMPI(MPI_Comm_rank(w->window_comm, &wcommrank));
774       if (!dynsf_full) PetscCall(PetscSFWindowCreateDynamicSF(sf, &dynsf_full));
775       PetscCall(PetscSFBcastBegin(dynsf_full, MPI_INT, &wcommrank, w->wcommranks, MPI_REPLACE));
776       PetscCall(PetscSFBcastEnd(dynsf_full, MPI_INT, &wcommrank, w->wcommranks, MPI_REPLACE));
777 
778       if (w->flavor == PETSCSF_WINDOW_FLAVOR_DYNAMIC) {
779         PetscCall(PetscSFCreate(w->window_comm, &w->dynsf));
780         PetscCall(PetscSFSetType(w->dynsf, PETSCSFBASIC)); /* break recursion */
781         PetscCall(PetscMalloc1(sf->nranks, &remotes));
782         for (PetscInt i = 0; i < sf->nranks; i++) {
783           remotes[i].rank  = w->wcommranks[i];
784           remotes[i].index = 0;
785         }
786         PetscCall(PetscSFSetGraph(w->dynsf, 1, sf->nranks, NULL, PETSC_OWN_POINTER, remotes, PETSC_OWN_POINTER));
787       }
788     } else {
789       PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)sf), &w->window_comm, NULL));
790       PetscCall(PetscArraycpy(w->wcommranks, sf->ranks, nranks));
791       PetscCall(PetscObjectReference((PetscObject)dynsf_full));
792       w->dynsf = dynsf_full;
793     }
794     if (w->dynsf) PetscCall(PetscSFSetUp(w->dynsf));
795     PetscCall(PetscSFDestroy(&dynsf_full));
796   }
797   switch (w->sync) {
798   case PETSCSF_WINDOW_SYNC_ACTIVE:
799     PetscCall(PetscSFGetGroups(sf, &ingroup, &outgroup));
800   default:
801     break;
802   }
803   PetscFunctionReturn(PETSC_SUCCESS);
804 }
805 
806 static PetscErrorCode PetscSFSetFromOptions_Window(PetscSF sf, PetscOptionItems PetscOptionsObject)
807 {
808   PetscSF_Window         *w      = (PetscSF_Window *)sf->data;
809   PetscSFWindowFlavorType flavor = w->flavor;
810 
811   PetscFunctionBegin;
812   PetscOptionsHeadBegin(PetscOptionsObject, "PetscSF Window options");
813   PetscCall(PetscOptionsEnum("-sf_window_sync", "synchronization type to use for PetscSF Window communication", "PetscSFWindowSetSyncType", PetscSFWindowSyncTypes, (PetscEnum)w->sync, (PetscEnum *)&w->sync, NULL));
814   PetscCall(PetscOptionsEnum("-sf_window_flavor", "flavor to use for PetscSF Window creation", "PetscSFWindowSetFlavorType", PetscSFWindowFlavorTypes, (PetscEnum)flavor, (PetscEnum *)&flavor, NULL));
815   PetscCall(PetscSFWindowSetFlavorType(sf, flavor));
816   PetscOptionsHeadEnd();
817   PetscFunctionReturn(PETSC_SUCCESS);
818 }
819 
/* Release everything the window SF has cached: per-unit MPI datatypes, MPI windows,
   the dynamic rank-translation SF, the MPI_Info object, and the window communicator.
   After this the SF can be set up again from scratch. */
static PetscErrorCode PetscSFReset_Window(PetscSF sf)
{
  PetscSF_Window *w = (PetscSF_Window *)sf->data;
  PetscSFDataLink link, next;
  PetscSFWinLink  wlink, wnext;
  PetscInt        i;
  MPI_Comm        wcomm;
  PetscBool       is_empty;

  PetscFunctionBegin;
  /* free the lazily-built datatype cache: one unit type plus per-rank mine/remote types per link */
  for (link = w->link; link; link = next) {
    next = link->next;
    PetscCallMPI(MPI_Type_free(&link->unit));
    for (i = 0; i < sf->nranks; i++) {
      PetscCallMPI(MPI_Type_free(&link->mine[i]));
      PetscCallMPI(MPI_Type_free(&link->remote[i]));
    }
    PetscCall(PetscFree2(link->mine, link->remote));
    PetscCall(PetscFree(link));
  }
  w->link  = NULL;
  wcomm    = w->window_comm;
  is_empty = w->is_empty;
  /* empty processes were split out of the window communicator and never created
     windows (see PetscSFSetUp_Window), so MPI_Win_free must not be called on them */
  if (!is_empty) {
    for (wlink = w->wins; wlink; wlink = wnext) {
      wnext = wlink->next;
      PetscCheck(!wlink->inuse, wcomm, PETSC_ERR_ARG_WRONGSTATE, "Window still in use with address %p", (void *)wlink->addr);
      PetscCall(PetscFree(wlink->dyn_target_addr));
      PetscCall(PetscFree(wlink->reqs));
      PetscCallMPI(MPI_Win_free(&wlink->win));
      PetscCall(PetscFree(wlink));
    }
  }
  w->wins = NULL;
  PetscCall(PetscSFDestroy(&w->dynsf));
  if (w->info != MPI_INFO_NULL) PetscCallMPI(MPI_Info_free(&w->info));
  PetscCall(PetscCommDestroy(&w->window_comm));
  PetscCall(PetscFree(w->wcommranks));
  PetscFunctionReturn(PETSC_SUCCESS);
}
860 
/* Pre-register a (rootdata, leafdata) pair for repeated communication with the
   DYNAMIC window flavor: create a dynamic window once and attach rootdata to it,
   so subsequent Bcast/Reduce calls reuse the window instead of re-creating it.
   A no-op for other flavors and on processes split out as empty. */
static PetscErrorCode PetscSFRegisterPersistent_Window(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata)
{
  PetscSF_Window *w = (PetscSF_Window *)sf->data;
  MPI_Aint        bytes, wsize;
  PetscBool       is_empty;
  PetscSFWinLink  link;

  PetscFunctionBegin;
  PetscCall(PetscSFSetUp(sf));
  if (w->flavor != PETSCSF_WINDOW_FLAVOR_DYNAMIC) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscSFGetDatatypeSize_Internal(PetscObjectComm((PetscObject)sf), unit, &bytes));
  wsize    = (MPI_Aint)(bytes * sf->nroots); /* total root buffer size exposed through the window */
  is_empty = w->is_empty;
  if (is_empty) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscNew(&link));
  link->flavor = w->flavor;
  link->next   = w->wins;
#if defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
  {
    MPI_Comm wcomm = w->window_comm;
    PetscCallMPI(MPI_Win_create_dynamic(w->info, wcomm, &link->win));
  }
#endif
  PetscCall(PetscSFWindowAttach(sf, link, (void *)rootdata, wsize));
  /* record the pair so PetscSFDeregisterPersistent_Window() / PetscSFGetWindow() can match it */
  link->rootdata   = (void *)rootdata;
  link->leafdata   = (void *)leafdata;
  link->bytes      = bytes;
  link->epoch      = PETSC_FALSE;
  link->inuse      = PETSC_FALSE;
  link->persistent = PETSC_TRUE;
  w->wins          = link;
  if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) {
    PetscInt i;

    /* LOCK synchronization tracks one outstanding request per remote rank */
    PetscCall(PetscMalloc1(sf->nranks, &link->reqs));
    for (i = 0; i < sf->nranks; i++) link->reqs[i] = MPI_REQUEST_NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
900 
/* Undo PetscSFRegisterPersistent_Window(): find the persistent link matching the
   (rootdata, leafdata, unit size) triple, detach the memory, and free the window.
   A no-op for non-DYNAMIC flavors and on processes split out as empty. */
static PetscErrorCode PetscSFDeregisterPersistent_Window(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata)
{
  PetscSF_Window *w = (PetscSF_Window *)sf->data;
  MPI_Aint        bytes;
  MPI_Comm        wcomm;
  PetscBool       is_empty;
  PetscSFWinLink *p;

  PetscFunctionBegin;
  PetscCall(PetscSFSetUp(sf));
  if (w->flavor != PETSCSF_WINDOW_FLAVOR_DYNAMIC) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscSFGetDatatypeSize_Internal(PetscObjectComm((PetscObject)sf), unit, &bytes));
  wcomm    = w->window_comm;
  is_empty = w->is_empty;
  if (is_empty) PetscFunctionReturn(PETSC_SUCCESS);
  /* walk with a pointer-to-pointer so the matching link can be unlinked in place */
  for (p = &w->wins; *p; p = &(*p)->next) {
    PetscSFWinLink link = *p;
    if (link->flavor == w->flavor && link->persistent && link->rootdata == rootdata && link->leafdata == leafdata && link->bytes == bytes) {
      PetscCheck(!link->inuse, wcomm, PETSC_ERR_ARG_WRONGSTATE, "Deregistering a window when communication is still in progress");
      PetscCheck(!link->epoch, wcomm, PETSC_ERR_ARG_WRONGSTATE, "Deregistering a window with an unconcluded epoch");
#if defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
      /* memory must be detached before the dynamic window is freed */
      PetscCallMPI(MPI_Win_detach(link->win, link->addr));
      link->addr = NULL;
#endif
      PetscCall(PetscFree(link->dyn_target_addr));
      PetscCall(PetscFree(link->reqs));
      PetscCallMPI(MPI_Win_free(&link->win));
      *p = link->next;
      PetscCall(PetscFree(link));
      break;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
935 
936 static PetscErrorCode PetscSFDestroy_Window(PetscSF sf)
937 {
938   PetscFunctionBegin;
939   PetscCall(PetscSFReset_Window(sf));
940   PetscCall(PetscFree(sf->data));
941   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetSyncType_C", NULL));
942   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetSyncType_C", NULL));
943   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetFlavorType_C", NULL));
944   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetFlavorType_C", NULL));
945   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetInfo_C", NULL));
946   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetInfo_C", NULL));
947   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFRegisterPersistent_C", NULL));
948   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFDeregisterPersistent_C", NULL));
949   PetscFunctionReturn(PETSC_SUCCESS);
950 }
951 
952 static PetscErrorCode PetscSFView_Window(PetscSF sf, PetscViewer viewer)
953 {
954   PetscSF_Window   *w = (PetscSF_Window *)sf->data;
955   PetscBool         isascii;
956   PetscViewerFormat format;
957 
958   PetscFunctionBegin;
959   PetscCall(PetscViewerGetFormat(viewer, &format));
960   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
961   if (isascii) {
962     PetscCall(PetscViewerASCIIPrintf(viewer, "  current flavor=%s synchronization=%s MultiSF sort=%s\n", PetscSFWindowFlavorTypes[w->flavor], PetscSFWindowSyncTypes[w->sync], sf->rankorder ? "rank-order" : "unordered"));
963     if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
964       if (w->info != MPI_INFO_NULL) {
965         PetscMPIInt k, nkeys;
966         char        key[MPI_MAX_INFO_KEY], value[MPI_MAX_INFO_VAL];
967 
968         PetscCallMPI(MPI_Info_get_nkeys(w->info, &nkeys));
969         PetscCall(PetscViewerASCIIPrintf(viewer, "    current info with %d keys. Ordered key-value pairs follow:\n", nkeys));
970         for (k = 0; k < nkeys; k++) {
971           PetscMPIInt flag;
972 
973           PetscCallMPI(MPI_Info_get_nthkey(w->info, k, key));
974           PetscCallMPI(MPI_Info_get(w->info, key, MPI_MAX_INFO_VAL, value, &flag));
975           PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing key %s", key);
976           PetscCall(PetscViewerASCIIPrintf(viewer, "      %s = %s\n", key, value));
977         }
978       } else {
979         PetscCall(PetscViewerASCIIPrintf(viewer, "    current info=MPI_INFO_NULL\n"));
980       }
981     }
982   }
983   PetscFunctionReturn(PETSC_SUCCESS);
984 }
985 
986 static PetscErrorCode PetscSFDuplicate_Window(PetscSF sf, PetscSFDuplicateOption opt, PetscSF newsf)
987 {
988   PetscSF_Window       *w = (PetscSF_Window *)sf->data;
989   PetscSFWindowSyncType synctype;
990 
991   PetscFunctionBegin;
992   synctype = w->sync;
993   /* HACK: Must use FENCE or LOCK when called from PetscSFGetGroups() because ACTIVE here would cause recursion. */
994   if (!sf->setupcalled) synctype = PETSCSF_WINDOW_SYNC_LOCK;
995   PetscCall(PetscSFWindowSetSyncType(newsf, synctype));
996   PetscCall(PetscSFWindowSetFlavorType(newsf, w->flavor));
997   PetscCall(PetscSFWindowSetInfo(newsf, w->info));
998   PetscFunctionReturn(PETSC_SUCCESS);
999 }
1000 
/* Begin a root-to-leaf broadcast: open an RMA read epoch on the window exposing
   rootdata and issue one MPI_Get per root rank into leafdata. Only MPI_REPLACE is
   supported. The epoch is concluded in PetscSFBcastEnd_Window(). */
static PetscErrorCode PetscSFBcastBegin_Window(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, void *leafdata, MPI_Op op)
{
  PetscSF_Window     *w = (PetscSF_Window *)sf->data;
  PetscMPIInt         nranks;
  const PetscMPIInt  *ranks;
  const MPI_Aint     *target_disp;
  const MPI_Datatype *mine, *remote;
  MPI_Request        *reqs;
  MPI_Win             win;

  PetscFunctionBegin;
  PetscCheck(op == MPI_REPLACE, PetscObjectComm((PetscObject)sf), PETSC_ERR_SUP, "PetscSFBcastBegin_Window with op!=MPI_REPLACE has not been implemented");
  PetscCall(PetscSFGetRootRanks(sf, &nranks, NULL, NULL, NULL, NULL));
  /* per-rank derived datatypes describing the local (mine) and remote layouts for this unit */
  PetscCall(PetscSFWindowGetDataTypes(sf, unit, &mine, &remote));
  PetscCall(PetscSFGetWindow(sf, unit, (void *)rootdata, leafdata, w->sync, PETSC_TRUE, MPI_MODE_NOPUT | MPI_MODE_NOPRECEDE, MPI_MODE_NOPUT, 0, &target_disp, &reqs, &win));
  /* target ranks are expressed in the window communicator, not the SF communicator */
  ranks = w->wcommranks;
  for (PetscMPIInt i = 0; i < nranks; i++) {
    MPI_Aint tdp = target_disp ? target_disp[i] : 0; /* nonzero only for DYNAMIC windows */
    if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) {
      PetscCallMPI(MPI_Win_lock(MPI_LOCK_SHARED, ranks[i], MPI_MODE_NOCHECK, win));
#if defined(PETSC_HAVE_MPI_RGET)
      /* request-based get: completion is waited on in BcastEnd before unlocking */
      PetscCallMPI(MPI_Rget(leafdata, 1, mine[i], ranks[i], tdp, 1, remote[i], win, &reqs[i]));
#else
      PetscCallMPI(MPI_Get(leafdata, 1, mine[i], ranks[i], tdp, 1, remote[i], win));
#endif
    } else {
      CHKMEMQ;
      PetscCallMPI(MPI_Get(leafdata, 1, mine[i], ranks[i], tdp, 1, remote[i], win));
      CHKMEMQ;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1034 
/* Conclude a broadcast started by PetscSFBcastBegin_Window(): wait on any
   request-based gets, release per-rank locks (LOCK sync only), and close the
   RMA epoch by restoring the window. */
static PetscErrorCode PetscSFBcastEnd_Window(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata, MPI_Op op)
{
  PetscSF_Window *w = (PetscSF_Window *)sf->data;
  MPI_Win         win;
  MPI_Request    *reqs = NULL;

  PetscFunctionBegin;
  /* locate the window that BcastBegin associated with this rootdata/leafdata pair */
  PetscCall(PetscSFFindWindow(sf, unit, rootdata, leafdata, &win, &reqs));
  if (reqs) PetscCallMPI(MPI_Waitall(sf->nranks, reqs, MPI_STATUSES_IGNORE));
  if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) {
    PetscMPIInt        nranks;
    const PetscMPIInt *ranks;

    PetscCall(PetscSFGetRootRanks(sf, &nranks, NULL, NULL, NULL, NULL));
    ranks = w->wcommranks; /* window-communicator rank numbering */
    for (PetscMPIInt i = 0; i < nranks; i++) PetscCallMPI(MPI_Win_unlock(ranks[i], win));
  }
  PetscCall(PetscSFRestoreWindow(sf, unit, (void *)rootdata, w->sync, PETSC_TRUE, MPI_MODE_NOSTORE | MPI_MODE_NOSUCCEED, PETSC_FALSE, &win));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1055 
/* Begin a leaf-to-root reduction: issue one MPI_Accumulate per root rank, applying
   op elementwise into the remote rootdata window. With LOCK synchronization each
   accumulate is wrapped in its own lock/unlock, so the transfer completes here and
   ReduceEnd only has to restore the window. */
static PetscErrorCode PetscSFReduceBegin_Window(PetscSF sf, MPI_Datatype unit, PetscMemType leafmtype, const void *leafdata, PetscMemType rootmtype, void *rootdata, MPI_Op op)
{
  PetscSF_Window     *w = (PetscSF_Window *)sf->data;
  PetscMPIInt         nranks;
  const PetscMPIInt  *ranks;
  const MPI_Aint     *target_disp;
  const MPI_Datatype *mine, *remote;
  MPI_Win             win;

  PetscFunctionBegin;
  PetscCall(PetscSFGetRootRanks(sf, &nranks, NULL, NULL, NULL, NULL));
  PetscCall(PetscSFWindowGetDataTypes(sf, unit, &mine, &remote));
  /* map PETSc MPIU_* ops to raw MPI ops usable inside MPI_Accumulate */
  PetscCall(PetscSFWindowOpTranslate(&op));
  PetscCall(PetscSFGetWindow(sf, unit, rootdata, (void *)leafdata, w->sync, PETSC_TRUE, MPI_MODE_NOPRECEDE, 0, 0, &target_disp, NULL, &win));
  ranks = w->wcommranks; /* window-communicator rank numbering */
  for (PetscMPIInt i = 0; i < nranks; i++) {
    MPI_Aint tdp = target_disp ? target_disp[i] : 0; /* nonzero only for DYNAMIC windows */

    if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) PetscCallMPI(MPI_Win_lock(MPI_LOCK_SHARED, ranks[i], MPI_MODE_NOCHECK, win));
    PetscCallMPI(MPI_Accumulate((void *)leafdata, 1, mine[i], ranks[i], tdp, 1, remote[i], op, win));
    if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) PetscCallMPI(MPI_Win_unlock(ranks[i], win));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1080 
1081 static PetscErrorCode PetscSFReduceEnd_Window(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *rootdata, MPI_Op op)
1082 {
1083   PetscSF_Window *w = (PetscSF_Window *)sf->data;
1084   MPI_Win         win;
1085   MPI_Request    *reqs = NULL;
1086 
1087   PetscFunctionBegin;
1088   PetscCall(PetscSFFindWindow(sf, unit, rootdata, leafdata, &win, &reqs));
1089   if (reqs) PetscCallMPI(MPI_Waitall(sf->nranks, reqs, MPI_STATUSES_IGNORE));
1090   PetscCall(PetscSFRestoreWindow(sf, unit, rootdata, w->sync, PETSC_TRUE, MPI_MODE_NOSUCCEED, PETSC_TRUE, &win));
1091   PetscFunctionReturn(PETSC_SUCCESS);
1092 }
1093 
/* Begin fetch-and-op: atomically read the current root values into leafupdate and
   accumulate leafdata into the roots. With MPI_Get_accumulate this is one atomic
   call per rank; without it, an exclusive lock serializes a separate Get+Accumulate
   pair, and a fresh CREATE-flavor window is used so a user "no_locks" info hint on
   the cached window cannot break the locking. */
static PetscErrorCode PetscSFFetchAndOpBegin_Window(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, void *rootdata, PetscMemType leafmtype, const void *leafdata, void *leafupdate, MPI_Op op)
{
  PetscMPIInt         nranks;
  const PetscMPIInt  *ranks;
  const MPI_Datatype *mine, *remote;
  const MPI_Aint     *target_disp;
  MPI_Win             win;
  PetscSF_Window     *w = (PetscSF_Window *)sf->data;
#if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  PetscSFWindowFlavorType oldf;
#endif

  PetscFunctionBegin;
  PetscCall(PetscSFGetRootRanks(sf, &nranks, NULL, NULL, NULL, NULL));
  PetscCall(PetscSFWindowGetDataTypes(sf, unit, &mine, &remote));
  /* map PETSc MPIU_* ops to raw MPI ops usable inside accumulate operations */
  PetscCall(PetscSFWindowOpTranslate(&op));
#if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  /* FetchAndOp without MPI_Get_Accumulate requires locking.
     we create a new window every time to not interfere with user-defined MPI_Info which may have used "no_locks"="true" */
  oldf      = w->flavor;
  w->flavor = PETSCSF_WINDOW_FLAVOR_CREATE;
  PetscCall(PetscSFGetWindow(sf, unit, rootdata, (void *)leafdata, PETSCSF_WINDOW_SYNC_LOCK, PETSC_FALSE, 0, 0, 0, &target_disp, NULL, &win));
#else
  PetscCall(PetscSFGetWindow(sf, unit, rootdata, (void *)leafdata, w->sync, PETSC_TRUE, MPI_MODE_NOPRECEDE, 0, 0, &target_disp, NULL, &win));
#endif
  ranks = w->wcommranks; /* window-communicator rank numbering */
  for (PetscMPIInt i = 0; i < nranks; i++) {
    MPI_Aint tdp = target_disp ? target_disp[i] : 0; /* nonzero only for DYNAMIC windows */

#if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
    /* exclusive lock makes the Get + Accumulate pair atomic with respect to other origins */
    PetscCallMPI(MPI_Win_lock(MPI_LOCK_EXCLUSIVE, ranks[i], 0, win));
    PetscCallMPI(MPI_Get(leafupdate, 1, mine[i], ranks[i], tdp, 1, remote[i], win));
    PetscCallMPI(MPI_Accumulate((void *)leafdata, 1, mine[i], ranks[i], tdp, 1, remote[i], op, win));
    PetscCallMPI(MPI_Win_unlock(ranks[i], win));
#else
    if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) PetscCallMPI(MPI_Win_lock(MPI_LOCK_SHARED, ranks[i], 0, win));
    PetscCallMPI(MPI_Get_accumulate((void *)leafdata, 1, mine[i], leafupdate, 1, mine[i], ranks[i], tdp, 1, remote[i], op, win));
    if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) PetscCallMPI(MPI_Win_unlock(ranks[i], win));
#endif
  }
#if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  /* restore the user-selected flavor that was overridden above */
  w->flavor = oldf;
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
1139 
/* Conclude a fetch-and-op started by PetscSFFetchAndOpBegin_Window(). The restore
   arguments mirror how the window was acquired: LOCK sync with a throwaway window
   when MPI_Get_accumulate is unavailable, the configured sync otherwise. */
static PetscErrorCode PetscSFFetchAndOpEnd_Window(PetscSF sf, MPI_Datatype unit, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
{
  MPI_Win win;
#if defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  PetscSF_Window *w = (PetscSF_Window *)sf->data;
#endif
  MPI_Request *reqs = NULL;

  PetscFunctionBegin;
  /* locate the window that FetchAndOpBegin associated with this rootdata/leafdata pair */
  PetscCall(PetscSFFindWindow(sf, unit, rootdata, leafdata, &win, &reqs));
  if (reqs) PetscCallMPI(MPI_Waitall(sf->nranks, reqs, MPI_STATUSES_IGNORE));
#if defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  PetscCall(PetscSFRestoreWindow(sf, unit, rootdata, w->sync, PETSC_TRUE, MPI_MODE_NOSUCCEED, PETSC_TRUE, &win));
#else
  PetscCall(PetscSFRestoreWindow(sf, unit, rootdata, PETSCSF_WINDOW_SYNC_LOCK, PETSC_FALSE, 0, PETSC_TRUE, &win));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
1158 
1159 PETSC_INTERN PetscErrorCode PetscSFCreate_Window(PetscSF sf)
1160 {
1161   PetscSF_Window *w = (PetscSF_Window *)sf->data;
1162 
1163   PetscFunctionBegin;
1164   sf->ops->SetUp           = PetscSFSetUp_Window;
1165   sf->ops->SetFromOptions  = PetscSFSetFromOptions_Window;
1166   sf->ops->Reset           = PetscSFReset_Window;
1167   sf->ops->Destroy         = PetscSFDestroy_Window;
1168   sf->ops->View            = PetscSFView_Window;
1169   sf->ops->Duplicate       = PetscSFDuplicate_Window;
1170   sf->ops->BcastBegin      = PetscSFBcastBegin_Window;
1171   sf->ops->BcastEnd        = PetscSFBcastEnd_Window;
1172   sf->ops->ReduceBegin     = PetscSFReduceBegin_Window;
1173   sf->ops->ReduceEnd       = PetscSFReduceEnd_Window;
1174   sf->ops->FetchAndOpBegin = PetscSFFetchAndOpBegin_Window;
1175   sf->ops->FetchAndOpEnd   = PetscSFFetchAndOpEnd_Window;
1176 
1177   PetscCall(PetscNew(&w));
1178   sf->data       = (void *)w;
1179   w->sync        = PETSCSF_WINDOW_SYNC_FENCE;
1180   w->flavor      = PETSCSF_WINDOW_FLAVOR_CREATE;
1181   w->info        = MPI_INFO_NULL;
1182   w->window_comm = MPI_COMM_NULL;
1183 
1184   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetSyncType_C", PetscSFWindowSetSyncType_Window));
1185   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetSyncType_C", PetscSFWindowGetSyncType_Window));
1186   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetFlavorType_C", PetscSFWindowSetFlavorType_Window));
1187   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetFlavorType_C", PetscSFWindowGetFlavorType_Window));
1188   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetInfo_C", PetscSFWindowSetInfo_Window));
1189   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetInfo_C", PetscSFWindowGetInfo_Window));
1190   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFRegisterPersistent_C", PetscSFRegisterPersistent_Window));
1191   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFDeregisterPersistent_C", PetscSFDeregisterPersistent_Window));
1192 
1193 #if defined(PETSC_HAVE_OPENMPI)
1194   #if PETSC_PKG_OPENMPI_VERSION_LE(1, 6, 0)
1195   {
1196     PetscBool ackbug = PETSC_FALSE;
1197 
1198     PetscCall(PetscOptionsGetBool(NULL, NULL, "-acknowledge_ompi_onesided_bug", &ackbug, NULL));
1199     PetscCheck(ackbug, PetscObjectComm((PetscObject)sf), PETSC_ERR_LIB, "Open MPI is known to be buggy (https://svn.open-mpi.org/trac/ompi/ticket/1905 and 2656), use -acknowledge_ompi_onesided_bug to proceed");
1200     PetscCall(PetscInfo(sf, "Acknowledged Open MPI bug, proceeding anyway. Expect memory corruption.\n"));
1201   }
1202   #endif
1203 #endif
1204   PetscFunctionReturn(PETSC_SUCCESS);
1205 }
1206