xref: /petsc/src/vec/is/sf/impls/window/sfwindow.c (revision 4ad8454beace47809662cdae21ee081016eaa39a)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 
3 typedef struct _n_PetscSFDataLink *PetscSFDataLink;
4 typedef struct _n_PetscSFWinLink  *PetscSFWinLink;
5 
6 typedef struct {
7   PetscSFWindowSyncType   sync;   /* FENCE, LOCK, or ACTIVE synchronization */
8   PetscSFDataLink         link;   /* List of MPI data types, lazily constructed for each data type */
9   PetscSFWinLink          wins;   /* List of active windows */
10   PetscSFWindowFlavorType flavor; /* Current PETSCSF_WINDOW_FLAVOR_ */
11   PetscSF                 dynsf;
12   MPI_Info                info;
13 } PetscSF_Window;
14 
15 struct _n_PetscSFDataLink {
16   MPI_Datatype    unit;
17   MPI_Datatype   *mine;
18   MPI_Datatype   *remote;
19   PetscSFDataLink next;
20 };
21 
22 struct _n_PetscSFWinLink {
23   PetscBool               inuse;
24   size_t                  bytes;
25   void                   *addr;
26   void                   *paddr;
27   MPI_Win                 win;
28   MPI_Request            *reqs;
29   PetscSFWindowFlavorType flavor;
30   MPI_Aint               *dyn_target_addr;
31   PetscBool               epoch;
32   PetscSFWinLink          next;
33 };
34 
35 const char *const PetscSFWindowSyncTypes[]   = {"FENCE", "LOCK", "ACTIVE", "PetscSFWindowSyncType", "PETSCSF_WINDOW_SYNC_", NULL};
36 const char *const PetscSFWindowFlavorTypes[] = {"CREATE", "DYNAMIC", "ALLOCATE", "SHARED", "PetscSFWindowFlavorType", "PETSCSF_WINDOW_FLAVOR_", NULL};
37 
38 /* Built-in MPI_Ops act elementwise inside MPI_Accumulate, but cannot be used with composite types inside collectives (MPI_Allreduce) */
39 static PetscErrorCode PetscSFWindowOpTranslate(MPI_Op *op)
40 {
41   PetscFunctionBegin;
42   if (*op == MPIU_SUM) *op = MPI_SUM;
43   else if (*op == MPIU_MAX) *op = MPI_MAX;
44   else if (*op == MPIU_MIN) *op = MPI_MIN;
45   PetscFunctionReturn(PETSC_SUCCESS);
46 }
47 
48 /*
49    PetscSFWindowGetDataTypes - gets composite local and remote data types for each rank
50 
51    Not Collective
52 
53    Input Parameters:
54 +  sf - star forest of type `PETSCSFWINDOW`
55 -  unit - data type for each node
56 
57    Output Parameters:
58 +  localtypes - types describing part of local leaf buffer referencing each remote rank
59 -  remotetypes - types describing part of remote root buffer referenced for each remote rank
60 
61    Level: developer
62 
63 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFSetGraph()`, `PetscSFView()`
*/
static PetscErrorCode PetscSFWindowGetDataTypes(PetscSF sf, MPI_Datatype unit, const MPI_Datatype **localtypes, const MPI_Datatype **remotetypes)
{
  PetscSF_Window    *w = (PetscSF_Window *)sf->data;
  PetscSFDataLink    link;
  PetscInt           i, nranks;
  const PetscInt    *roffset, *rmine, *rremote;
  const PetscMPIInt *ranks;

  PetscFunctionBegin;
  /* Look for types in cache */
  for (link = w->link; link; link = link->next) {
    PetscBool match;
    PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
    if (match) {
      *localtypes  = link->mine;
      *remotetypes = link->remote;
      PetscFunctionReturn(PETSC_SUCCESS);
    }
  }

  /* Create new composite types for each send rank */
  PetscCall(PetscSFGetRootRanks(sf, &nranks, &ranks, &roffset, &rmine, &rremote));
  PetscCall(PetscNew(&link));
  PetscCallMPI(MPI_Type_dup(unit, &link->unit));
  PetscCall(PetscMalloc2(nranks, &link->mine, nranks, &link->remote));
  for (i = 0; i < nranks; i++) {
    PetscInt     rcount = roffset[i + 1] - roffset[i];
    PetscMPIInt *rmine, *rremote; /* intentionally shadows the PetscInt pointers above: MPI_Type_create_indexed_block() takes int displacements */
#if !defined(PETSC_USE_64BIT_INDICES)
    /* PetscInt and PetscMPIInt have the same width here, so the SF index arrays can be aliased directly */
    rmine   = sf->rmine + sf->roffset[i];
    rremote = sf->rremote + sf->roffset[i];
#else
    /* 64-bit PetscInt: copy this rank's indices into temporary int arrays, checking each narrowing cast */
    PetscInt j;
    PetscCall(PetscMalloc2(rcount, &rmine, rcount, &rremote));
    for (j = 0; j < rcount; j++) {
      PetscCall(PetscMPIIntCast(sf->rmine[sf->roffset[i] + j], rmine + j));
      PetscCall(PetscMPIIntCast(sf->rremote[sf->roffset[i] + j], rremote + j));
    }
#endif

    PetscCallMPI(MPI_Type_create_indexed_block(rcount, 1, rmine, link->unit, &link->mine[i]));
    PetscCallMPI(MPI_Type_create_indexed_block(rcount, 1, rremote, link->unit, &link->remote[i]));
#if defined(PETSC_USE_64BIT_INDICES)
    PetscCall(PetscFree2(rmine, rremote));
#endif
    PetscCallMPI(MPI_Type_commit(&link->mine[i]));
    PetscCallMPI(MPI_Type_commit(&link->remote[i]));
  }
  /* Push the new entry onto the head of the cache list */
  link->next = w->link;
  w->link    = link;

  *localtypes  = link->mine;
  *remotetypes = link->remote;
  PetscFunctionReturn(PETSC_SUCCESS);
}
120 
121 /*@C
122   PetscSFWindowSetFlavorType - Set flavor type for `MPI_Win` creation
123 
124   Logically Collective
125 
126   Input Parameters:
127 + sf     - star forest for communication of type `PETSCSFWINDOW`
128 - flavor - flavor type
129 
130   Options Database Key:
131 . -sf_window_flavor <flavor> - sets the flavor type CREATE, DYNAMIC, ALLOCATE or SHARED (see `PetscSFWindowFlavorType`)
132 
133   Level: advanced
134 
135   Notes:
136   Windows reuse follows these rules\:
137 .vb
138      PETSCSF_WINDOW_FLAVOR_CREATE: creates a new window every time, uses MPI_Win_create
139 
140      PETSCSF_WINDOW_FLAVOR_DYNAMIC: uses MPI_Win_create_dynamic/MPI_Win_attach and tries to reuse windows by comparing the root array. Intended to be used on repeated applications of the same SF, e.g.
141        for i=1 to K
142          PetscSFOperationBegin(rootdata1,leafdata_whatever);
143          PetscSFOperationEnd(rootdata1,leafdata_whatever);
144          ...
145          PetscSFOperationBegin(rootdataN,leafdata_whatever);
146          PetscSFOperationEnd(rootdataN,leafdata_whatever);
147        endfor
148        The following pattern will instead raise an error
149          PetscSFOperationBegin(rootdata1,leafdata_whatever);
150          PetscSFOperationEnd(rootdata1,leafdata_whatever);
151          PetscSFOperationBegin(rank ? rootdata1 : rootdata2,leafdata_whatever);
152          PetscSFOperationEnd(rank ? rootdata1 : rootdata2,leafdata_whatever);
153 
154      PETSCSF_WINDOW_FLAVOR_ALLOCATE: uses MPI_Win_allocate, reuses any pre-existing window which fits the data and it is not in use
155 
156      PETSCSF_WINDOW_FLAVOR_SHARED: uses MPI_Win_allocate_shared, reusage policy as for PETSCSF_WINDOW_FLAVOR_ALLOCATE
157 .ve
158 
159 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFSetFromOptions()`, `PetscSFWindowGetFlavorType()`
160 @*/
PetscErrorCode PetscSFWindowSetFlavorType(PetscSF sf, PetscSFWindowFlavorType flavor)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscValidLogicalCollectiveEnum(sf, flavor, 2);
  /* Dispatch to the type-specific implementation; silently a no-op if sf is not a PETSCSFWINDOW */
  PetscTryMethod(sf, "PetscSFWindowSetFlavorType_C", (PetscSF, PetscSFWindowFlavorType), (sf, flavor));
  PetscFunctionReturn(PETSC_SUCCESS);
}
169 
/* PETSCSFWINDOW implementation: record the flavor used for subsequent window creation */
static PetscErrorCode PetscSFWindowSetFlavorType_Window(PetscSF sf, PetscSFWindowFlavorType flavor)
{
  PetscSF_Window *w = (PetscSF_Window *)sf->data;

  PetscFunctionBegin;
  w->flavor = flavor;
  PetscFunctionReturn(PETSC_SUCCESS);
}
178 
179 /*@C
180   PetscSFWindowGetFlavorType - Get  `PETSCSFWINDOW` flavor type for `PetscSF` communication
181 
182   Logically Collective
183 
184   Input Parameter:
185 . sf - star forest for communication of type `PETSCSFWINDOW`
186 
187   Output Parameter:
188 . flavor - flavor type
189 
190   Level: advanced
191 
192 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFSetFromOptions()`, `PetscSFWindowSetFlavorType()`
193 @*/
PetscErrorCode PetscSFWindowGetFlavorType(PetscSF sf, PetscSFWindowFlavorType *flavor)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscAssertPointer(flavor, 2);
  /* Unlike the setter, this errors if sf is not a PETSCSFWINDOW (PetscUseMethod vs PetscTryMethod) */
  PetscUseMethod(sf, "PetscSFWindowGetFlavorType_C", (PetscSF, PetscSFWindowFlavorType *), (sf, flavor));
  PetscFunctionReturn(PETSC_SUCCESS);
}
202 
/* PETSCSFWINDOW implementation: report the currently selected window flavor */
static PetscErrorCode PetscSFWindowGetFlavorType_Window(PetscSF sf, PetscSFWindowFlavorType *flavor)
{
  PetscSF_Window *w = (PetscSF_Window *)sf->data;

  PetscFunctionBegin;
  *flavor = w->flavor;
  PetscFunctionReturn(PETSC_SUCCESS);
}
211 
212 /*@C
213   PetscSFWindowSetSyncType - Set synchronization type for `PetscSF` communication of type  `PETSCSFWINDOW`
214 
215   Logically Collective
216 
217   Input Parameters:
218 + sf   - star forest for communication
219 - sync - synchronization type
220 
221   Options Database Key:
222 . -sf_window_sync <sync> - sets the synchronization type FENCE, LOCK, or ACTIVE (see `PetscSFWindowSyncType`)
223 
224   Level: advanced
225 
226 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFSetFromOptions()`, `PetscSFWindowGetSyncType()`, `PetscSFWindowSyncType`
227 @*/
PetscErrorCode PetscSFWindowSetSyncType(PetscSF sf, PetscSFWindowSyncType sync)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscValidLogicalCollectiveEnum(sf, sync, 2);
  /* Dispatch to the type-specific implementation; silently a no-op if sf is not a PETSCSFWINDOW */
  PetscTryMethod(sf, "PetscSFWindowSetSyncType_C", (PetscSF, PetscSFWindowSyncType), (sf, sync));
  PetscFunctionReturn(PETSC_SUCCESS);
}
236 
/* PETSCSFWINDOW implementation: record the synchronization type (FENCE, LOCK, or ACTIVE) */
static PetscErrorCode PetscSFWindowSetSyncType_Window(PetscSF sf, PetscSFWindowSyncType sync)
{
  PetscSF_Window *w = (PetscSF_Window *)sf->data;

  PetscFunctionBegin;
  w->sync = sync;
  PetscFunctionReturn(PETSC_SUCCESS);
}
245 
246 /*@C
247   PetscSFWindowGetSyncType - Get synchronization type for `PetscSF` communication of type `PETSCSFWINDOW`
248 
249   Logically Collective
250 
251   Input Parameter:
252 . sf - star forest for communication
253 
254   Output Parameter:
255 . sync - synchronization type
256 
257   Level: advanced
258 
259 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFSetFromOptions()`, `PetscSFWindowSetSyncType()`, `PetscSFWindowSyncType`
260 @*/
PetscErrorCode PetscSFWindowGetSyncType(PetscSF sf, PetscSFWindowSyncType *sync)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscAssertPointer(sync, 2);
  /* Errors if sf is not a PETSCSFWINDOW (PetscUseMethod requires the composed method to exist) */
  PetscUseMethod(sf, "PetscSFWindowGetSyncType_C", (PetscSF, PetscSFWindowSyncType *), (sf, sync));
  PetscFunctionReturn(PETSC_SUCCESS);
}
269 
/* PETSCSFWINDOW implementation: report the currently selected synchronization type */
static PetscErrorCode PetscSFWindowGetSyncType_Window(PetscSF sf, PetscSFWindowSyncType *sync)
{
  PetscSF_Window *w = (PetscSF_Window *)sf->data;

  PetscFunctionBegin;
  *sync = w->sync;
  PetscFunctionReturn(PETSC_SUCCESS);
}
278 
279 /*@C
280   PetscSFWindowSetInfo - Set the `MPI_Info` handle that will be used for subsequent windows allocation
281 
282   Logically Collective
283 
284   Input Parameters:
285 + sf   - star forest for communication
286 - info - `MPI_Info` handle
287 
288   Level: advanced
289 
290   Note:
291   The info handle is duplicated with a call to `MPI_Info_dup()` unless info = `MPI_INFO_NULL`.
292 
293 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFSetFromOptions()`, `PetscSFWindowGetInfo()`
294 @*/
PetscErrorCode PetscSFWindowSetInfo(PetscSF sf, MPI_Info info)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  /* Dispatch to the type-specific implementation; silently a no-op if sf is not a PETSCSFWINDOW */
  PetscTryMethod(sf, "PetscSFWindowSetInfo_C", (PetscSF, MPI_Info), (sf, info));
  PetscFunctionReturn(PETSC_SUCCESS);
}
302 
/* PETSCSFWINDOW implementation: take a private duplicate of the given MPI_Info handle,
   releasing any handle set previously */
static PetscErrorCode PetscSFWindowSetInfo_Window(PetscSF sf, MPI_Info info)
{
  PetscSF_Window *w = (PetscSF_Window *)sf->data;

  PetscFunctionBegin;
  /* MPI_Info_free() resets w->info to MPI_INFO_NULL, so passing MPI_INFO_NULL clears the stored info */
  if (w->info != MPI_INFO_NULL) PetscCallMPI(MPI_Info_free(&w->info));
  if (info != MPI_INFO_NULL) PetscCallMPI(MPI_Info_dup(info, &w->info));
  PetscFunctionReturn(PETSC_SUCCESS);
}
312 
313 /*@C
314   PetscSFWindowGetInfo - Get the `MPI_Info` handle used for windows allocation
315 
316   Logically Collective
317 
318   Input Parameter:
319 . sf - star forest for communication
320 
321   Output Parameter:
322 . info - `MPI_Info` handle
323 
324   Level: advanced
325 
326   Note:
  If `PetscSFWindowSetInfo()` has not been called, this returns `MPI_INFO_NULL`
328 
329 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFSetFromOptions()`, `PetscSFWindowSetInfo()`
330 @*/
PetscErrorCode PetscSFWindowGetInfo(PetscSF sf, MPI_Info *info)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscAssertPointer(info, 2);
  /* Errors if sf is not a PETSCSFWINDOW (PetscUseMethod requires the composed method to exist) */
  PetscUseMethod(sf, "PetscSFWindowGetInfo_C", (PetscSF, MPI_Info *), (sf, info));
  PetscFunctionReturn(PETSC_SUCCESS);
}
339 
/* PETSCSFWINDOW implementation: return the stored MPI_Info handle (MPI_INFO_NULL if never set).
   The caller receives the internal handle, not a duplicate. */
static PetscErrorCode PetscSFWindowGetInfo_Window(PetscSF sf, MPI_Info *info)
{
  PetscSF_Window *w = (PetscSF_Window *)sf->data;

  PetscFunctionBegin;
  *info = w->info;
  PetscFunctionReturn(PETSC_SUCCESS);
}
348 
349 /*
350    PetscSFGetWindow - Get a window for use with a given data type
351 
352    Collective
353 
354    Input Parameters:
355 +  sf - star forest
356 .  unit - data type
357 .  array - array to be sent
358 .  sync - type of synchronization `PetscSFWindowSyncType`
359 .  epoch - `PETSC_TRUE` to acquire the window and start an epoch, `PETSC_FALSE` to just acquire the window
360 .  fenceassert - assert parameter for call to `MPI_Win_fence()`, if sync == `PETSCSF_WINDOW_SYNC_FENCE`
361 .  postassert - assert parameter for call to `MPI_Win_post()`, if sync == `PETSCSF_WINDOW_SYNC_ACTIVE`
362 -  startassert - assert parameter for call to `MPI_Win_start()`, if sync == `PETSCSF_WINDOW_SYNC_ACTIVE`
363 
364    Output Parameters:
+  target_disp - target_disp argument for RMA calls (significant for `PETSCSF_WINDOW_FLAVOR_DYNAMIC` only)
.  reqs - array of requests (significant for sync == `PETSCSF_WINDOW_SYNC_LOCK` only)
367 -  win - window
368 
369    Level: developer
370 
371 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFGetRootRanks()`, `PetscSFWindowGetDataTypes()`
372 */
static PetscErrorCode PetscSFGetWindow(PetscSF sf, MPI_Datatype unit, void *array, PetscSFWindowSyncType sync, PetscBool epoch, PetscMPIInt fenceassert, PetscMPIInt postassert, PetscMPIInt startassert, const MPI_Aint **target_disp, MPI_Request **reqs, MPI_Win *win)
{
  PetscSF_Window *w = (PetscSF_Window *)sf->data;
  MPI_Aint        lb, lb_true, bytes, bytes_true;
  PetscSFWinLink  link;
#if defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
  MPI_Aint winaddr;
  PetscInt nranks;
#endif
  PetscBool reuse = PETSC_FALSE, update = PETSC_FALSE;
  PetscBool dummy[2];
  MPI_Aint  wsize;

  PetscFunctionBegin;
  /* Only simple units are supported: zero lower bound and extent equal to the true extent */
  PetscCallMPI(MPI_Type_get_extent(unit, &lb, &bytes));
  PetscCallMPI(MPI_Type_get_true_extent(unit, &lb_true, &bytes_true));
  PetscCheck(lb == 0 && lb_true == 0, PetscObjectComm((PetscObject)sf), PETSC_ERR_SUP, "No support for unit type with nonzero lower bound, write petsc-maint@mcs.anl.gov if you want this feature");
  PetscCheck(bytes == bytes_true, PetscObjectComm((PetscObject)sf), PETSC_ERR_SUP, "No support for unit type with modified extent, write petsc-maint@mcs.anl.gov if you want this feature");
  /* All flavors except CREATE try to reuse a cached window */
  if (w->flavor != PETSCSF_WINDOW_FLAVOR_CREATE) reuse = PETSC_TRUE;
  for (link = w->wins; reuse && link; link = link->next) {
    PetscBool winok = PETSC_FALSE;
    if (w->flavor != link->flavor) continue;
    switch (w->flavor) {
    case PETSCSF_WINDOW_FLAVOR_DYNAMIC: /* check available matching array, error if in use (we additionally check that the matching condition is the same across processes) */
      if (array == link->addr) {
        if (PetscDefined(USE_DEBUG)) {
          /* dummy[0] = "everyone matched this link", dummy[1] = "someone matched this link";
             they must agree or ranks would disagree on window reuse */
          dummy[0] = PETSC_TRUE;
          dummy[1] = PETSC_TRUE;
          PetscCall(MPIU_Allreduce(MPI_IN_PLACE, dummy, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)sf)));
          PetscCall(MPIU_Allreduce(MPI_IN_PLACE, dummy + 1, 1, MPIU_BOOL, MPI_LOR, PetscObjectComm((PetscObject)sf)));
          PetscCheck(dummy[0] == dummy[1], PetscObjectComm((PetscObject)sf), PETSC_ERR_SUP, "PETSCSF_WINDOW_FLAVOR_DYNAMIC requires root pointers to be consistently used across the comm. Use PETSCSF_WINDOW_FLAVOR_CREATE or PETSCSF_WINDOW_FLAVOR_ALLOCATE instead");
        }
        PetscCheck(!link->inuse, PetscObjectComm((PetscObject)sf), PETSC_ERR_PLIB, "Window in use");
        PetscCheck(!epoch || !link->epoch, PetscObjectComm((PetscObject)sf), PETSC_ERR_PLIB, "Window epoch not finished");
        winok       = PETSC_TRUE;
        link->paddr = array;
      } else if (PetscDefined(USE_DEBUG)) {
        /* this rank did not match: participate in the same collectives with PETSC_FALSE */
        dummy[0] = PETSC_FALSE;
        dummy[1] = PETSC_FALSE;
        PetscCall(MPIU_Allreduce(MPI_IN_PLACE, dummy, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)sf)));
        PetscCall(MPIU_Allreduce(MPI_IN_PLACE, dummy + 1, 1, MPIU_BOOL, MPI_LOR, PetscObjectComm((PetscObject)sf)));
        PetscCheck(dummy[0] == dummy[1], PetscObjectComm((PetscObject)sf), PETSC_ERR_SUP, "PETSCSF_WINDOW_FLAVOR_DYNAMIC requires root pointers to be consistently used across the comm. Use PETSCSF_WINDOW_FLAVOR_CREATE or PETSCSF_WINDOW_FLAVOR_ALLOCATE instead");
      }
      break;
    case PETSCSF_WINDOW_FLAVOR_ALLOCATE: /* check available by matching size, allocate if in use */
    case PETSCSF_WINDOW_FLAVOR_SHARED:
      if (!link->inuse && bytes == (MPI_Aint)link->bytes) {
        update      = PETSC_TRUE; /* window memory is separate from array, must copy in/out */
        link->paddr = array;
        winok       = PETSC_TRUE;
      }
      break;
    default:
      SETERRQ(PetscObjectComm((PetscObject)sf), PETSC_ERR_SUP, "No support for flavor %s", PetscSFWindowFlavorTypes[w->flavor]);
    }
    if (winok) {
      *win = link->win;
      PetscCall(PetscInfo(sf, "Reusing window %" PETSC_INTPTR_T_FMT " of flavor %d for comm %" PETSC_INTPTR_T_FMT "\n", (PETSC_INTPTR_T)link->win, link->flavor, (PETSC_INTPTR_T)PetscObjectComm((PetscObject)sf)));
      goto found;
    }
  }

  /* No reusable window: create a new one of the requested flavor */
  wsize = (MPI_Aint)bytes * sf->nroots;
  PetscCall(PetscNew(&link));
  link->bytes           = bytes;
  link->next            = w->wins;
  link->flavor          = w->flavor;
  link->dyn_target_addr = NULL;
  link->reqs            = NULL;
  w->wins               = link;
  if (sync == PETSCSF_WINDOW_SYNC_LOCK) {
    PetscInt i;

    /* one request slot per root rank, used by the LOCK synchronization path */
    PetscCall(PetscMalloc1(sf->nranks, &link->reqs));
    for (i = 0; i < sf->nranks; i++) link->reqs[i] = MPI_REQUEST_NULL;
  }
  switch (w->flavor) {
  case PETSCSF_WINDOW_FLAVOR_CREATE:
    PetscCallMPI(MPI_Win_create(array, wsize, (PetscMPIInt)bytes, w->info, PetscObjectComm((PetscObject)sf), &link->win));
    link->addr  = array;
    link->paddr = array;
    break;
#if defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
  case PETSCSF_WINDOW_FLAVOR_DYNAMIC:
    PetscCallMPI(MPI_Win_create_dynamic(w->info, PetscObjectComm((PetscObject)sf), &link->win));
  #if defined(PETSC_HAVE_OMPI_MAJOR_VERSION) /* some Open MPI versions do not support MPI_Win_attach(win,NULL,0); */
    PetscCallMPI(MPI_Win_attach(link->win, wsize ? array : (void *)dummy, wsize));
  #else
    PetscCallMPI(MPI_Win_attach(link->win, array, wsize));
  #endif
    link->addr  = array;
    link->paddr = array;
    PetscCheck(w->dynsf, PetscObjectComm((PetscObject)sf), PETSC_ERR_ORDER, "Must call PetscSFSetUp()");
    PetscCall(PetscSFSetUp(w->dynsf));
    /* broadcast each root's attached base address so remote ranks can compute target displacements */
    PetscCall(PetscSFGetRootRanks(w->dynsf, &nranks, NULL, NULL, NULL, NULL));
    PetscCall(PetscMalloc1(nranks, &link->dyn_target_addr));
    PetscCallMPI(MPI_Get_address(array, &winaddr));
    PetscCall(PetscSFBcastBegin(w->dynsf, MPI_AINT, &winaddr, link->dyn_target_addr, MPI_REPLACE));
    PetscCall(PetscSFBcastEnd(w->dynsf, MPI_AINT, &winaddr, link->dyn_target_addr, MPI_REPLACE));
    break;
  case PETSCSF_WINDOW_FLAVOR_ALLOCATE:
    PetscCallMPI(MPI_Win_allocate(wsize, (PetscMPIInt)bytes, w->info, PetscObjectComm((PetscObject)sf), &link->addr, &link->win));
    update      = PETSC_TRUE; /* MPI owns the window memory; copy array into it below */
    link->paddr = array;
    break;
#endif
#if defined(PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY)
  case PETSCSF_WINDOW_FLAVOR_SHARED:
    PetscCallMPI(MPI_Win_allocate_shared(wsize, (PetscMPIInt)bytes, w->info, PetscObjectComm((PetscObject)sf), &link->addr, &link->win));
    update      = PETSC_TRUE;
    link->paddr = array;
    break;
#endif
  default:
    SETERRQ(PetscObjectComm((PetscObject)sf), PETSC_ERR_SUP, "No support for flavor %s", PetscSFWindowFlavorTypes[w->flavor]);
  }
  PetscCall(PetscInfo(sf, "New window %" PETSC_INTPTR_T_FMT " of flavor %d for comm %" PETSC_INTPTR_T_FMT "\n", (PETSC_INTPTR_T)link->win, link->flavor, (PETSC_INTPTR_T)PetscObjectComm((PetscObject)sf)));
  *win = link->win;

found:

  if (target_disp) *target_disp = link->dyn_target_addr;
  if (reqs) *reqs = link->reqs;
  if (update) { /* locks are needed for the "separate" memory model only, the fence guaranties memory-synchronization */
    PetscMPIInt rank;

    PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
    if (sync == PETSCSF_WINDOW_SYNC_LOCK) PetscCallMPI(MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, MPI_MODE_NOCHECK, *win));
    PetscCall(PetscMemcpy(link->addr, array, sf->nroots * bytes));
    if (sync == PETSCSF_WINDOW_SYNC_LOCK) {
      PetscCallMPI(MPI_Win_unlock(rank, *win));
      PetscCallMPI(MPI_Win_fence(0, *win));
    }
  }
  link->inuse = PETSC_TRUE;
  link->epoch = epoch;
  if (epoch) {
    /* open the requested access/exposure epoch on the window */
    switch (sync) {
    case PETSCSF_WINDOW_SYNC_FENCE:
      PetscCallMPI(MPI_Win_fence(fenceassert, *win));
      break;
    case PETSCSF_WINDOW_SYNC_LOCK: /* Handled outside */
      break;
    case PETSCSF_WINDOW_SYNC_ACTIVE: {
      MPI_Group   ingroup, outgroup;
      PetscMPIInt isize, osize;

      /* Open MPI 4.0.2 with btl=vader does not like calling
         - MPI_Win_complete when ogroup is empty
         - MPI_Win_wait when igroup is empty
         So, we do not even issue the corresponding start and post calls
         The MPI standard (Sec. 11.5.2 of MPI 3.1) only requires that
         start(outgroup) has a matching post(ingroup)
         and this is guaranteed by PetscSF
      */
      PetscCall(PetscSFGetGroups(sf, &ingroup, &outgroup));
      PetscCallMPI(MPI_Group_size(ingroup, &isize));
      PetscCallMPI(MPI_Group_size(outgroup, &osize));
      if (isize) PetscCallMPI(MPI_Win_post(ingroup, postassert, *win));
      if (osize) PetscCallMPI(MPI_Win_start(outgroup, startassert, *win));
    } break;
    default:
      SETERRQ(PetscObjectComm((PetscObject)sf), PETSC_ERR_PLIB, "Unknown synchronization type");
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
540 
541 /*
542    PetscSFFindWindow - Finds a window that is already in use
543 
544    Not Collective
545 
546    Input Parameters:
547 +  sf - star forest
548 .  unit - data type
549 -  array - array with which the window is associated
550 
551    Output Parameters:
552 +  win - window
553 -  reqs - outstanding requests associated to the window
554 
555    Level: developer
556 
557 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFGetWindow()`, `PetscSFRestoreWindow()`
558 */
559 static PetscErrorCode PetscSFFindWindow(PetscSF sf, MPI_Datatype unit, const void *array, MPI_Win *win, MPI_Request **reqs)
560 {
561   PetscSF_Window *w = (PetscSF_Window *)sf->data;
562   PetscSFWinLink  link;
563 
564   PetscFunctionBegin;
565   *win = MPI_WIN_NULL;
566   for (link = w->wins; link; link = link->next) {
567     if (array == link->paddr) {
568       PetscCall(PetscInfo(sf, "Window %" PETSC_INTPTR_T_FMT " of flavor %d for comm %" PETSC_INTPTR_T_FMT "\n", (PETSC_INTPTR_T)link->win, link->flavor, (PETSC_INTPTR_T)PetscObjectComm((PetscObject)sf)));
569       *win  = link->win;
570       *reqs = link->reqs;
571       PetscFunctionReturn(PETSC_SUCCESS);
572     }
573   }
574   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Requested window not in use");
575 }
576 
577 /*
578    PetscSFRestoreWindow - Restores a window obtained with `PetscSFGetWindow()`
579 
580    Collective
581 
582    Input Parameters:
583 +  sf - star forest
584 .  unit - data type
585 .  array - array associated with window
586 .  sync - type of synchronization `PetscSFWindowSyncType`
587 .  epoch - close an epoch, must match argument to `PetscSFGetWindow()`
588 .  update - if we have to update the local window array
589 -  win - window
590 
591    Level: developer
592 
593 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFFindWindow()`
594 */
static PetscErrorCode PetscSFRestoreWindow(PetscSF sf, MPI_Datatype unit, void *array, PetscSFWindowSyncType sync, PetscBool epoch, PetscMPIInt fenceassert, PetscBool update, MPI_Win *win)
{
  PetscSF_Window         *w = (PetscSF_Window *)sf->data;
  PetscSFWinLink         *p, link;
  PetscBool               reuse = PETSC_FALSE;
  PetscSFWindowFlavorType flavor;
  void                   *laddr;
  size_t                  bytes;

  PetscFunctionBegin;
  /* walk with a pointer-to-pointer so a matched link can be unlinked in place */
  for (p = &w->wins; *p; p = &(*p)->next) {
    link = *p;
    if (*win == link->win) {
      PetscCheck(array == link->paddr, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Matched window, but not array");
      if (epoch != link->epoch) {
        PetscCheck(!epoch, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "No epoch to end");
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Restoring window without ending epoch");
      }
      laddr  = link->addr;
      flavor = link->flavor;
      bytes  = link->bytes;
      /* CREATE windows are not cached: unlink now and skip the copy-back (array is the window memory) */
      if (flavor != PETSCSF_WINDOW_FLAVOR_CREATE) reuse = PETSC_TRUE;
      else {
        *p     = link->next;
        update = PETSC_FALSE;
      } /* remove from list */
      goto found;
    }
  }
  SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Requested window not in use");

found:
  PetscCall(PetscInfo(sf, "Window %" PETSC_INTPTR_T_FMT " of flavor %d for comm %" PETSC_INTPTR_T_FMT "\n", (PETSC_INTPTR_T)link->win, link->flavor, (PETSC_INTPTR_T)PetscObjectComm((PetscObject)sf)));
  if (epoch) {
    /* close the access/exposure epoch opened by PetscSFGetWindow() */
    switch (sync) {
    case PETSCSF_WINDOW_SYNC_FENCE:
      PetscCallMPI(MPI_Win_fence(fenceassert, *win));
      break;
    case PETSCSF_WINDOW_SYNC_LOCK: /* Handled outside */
      break;
    case PETSCSF_WINDOW_SYNC_ACTIVE: {
      MPI_Group   ingroup, outgroup;
      PetscMPIInt isize, osize;

      /* Open MPI 4.0.2 with btl=vader does not like calling
         - MPI_Win_complete when ogroup is empty
         - MPI_Win_wait when igroup is empty
         The MPI standard (Sec. 11.5.2 of MPI 3.1) only requires that
         - each process who issues a call to MPI_Win_start issues a call to MPI_Win_Complete
         - each process who issues a call to MPI_Win_post issues a call to MPI_Win_Wait
      */
      PetscCall(PetscSFGetGroups(sf, &ingroup, &outgroup));
      PetscCallMPI(MPI_Group_size(ingroup, &isize));
      PetscCallMPI(MPI_Group_size(outgroup, &osize));
      if (osize) PetscCallMPI(MPI_Win_complete(*win));
      if (isize) PetscCallMPI(MPI_Win_wait(*win));
    } break;
    default:
      SETERRQ(PetscObjectComm((PetscObject)sf), PETSC_ERR_PLIB, "Unknown synchronization type");
    }
  }
  if (update) {
    /* copy the (possibly remotely updated) window memory back into the user's array */
    if (sync == PETSCSF_WINDOW_SYNC_LOCK) PetscCallMPI(MPI_Win_fence(MPI_MODE_NOPUT | MPI_MODE_NOSUCCEED, *win));
    PetscCall(PetscMemcpy(array, laddr, sf->nroots * bytes));
  }
  link->epoch = PETSC_FALSE;
  link->inuse = PETSC_FALSE;
  link->paddr = NULL;
  if (!reuse) {
    /* non-cached (CREATE) window: destroy it and its auxiliary data */
    PetscCall(PetscFree(link->dyn_target_addr));
    PetscCall(PetscFree(link->reqs));
    PetscCallMPI(MPI_Win_free(&link->win));
    PetscCall(PetscFree(link));
    *win = MPI_WIN_NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
672 
/* Set up the window SF: compute root ranks, build the auxiliary one-root-per-rank SF
   used to broadcast dynamic window addresses, and pre-build groups for ACTIVE sync. */
static PetscErrorCode PetscSFSetUp_Window(PetscSF sf)
{
  PetscSF_Window *w = (PetscSF_Window *)sf->data;
  MPI_Group       ingroup, outgroup;

  PetscFunctionBegin;
  PetscCall(PetscSFSetUpRanks(sf, MPI_GROUP_EMPTY));
  if (!w->dynsf) {
    PetscInt     i;
    PetscSFNode *remotes;

    /* each root rank contributes a single root (index 0) to the auxiliary SF */
    PetscCall(PetscMalloc1(sf->nranks, &remotes));
    for (i = 0; i < sf->nranks; i++) {
      remotes[i].rank  = sf->ranks[i];
      remotes[i].index = 0;
    }
    PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_RANKS, &w->dynsf));
    PetscCall(PetscSFWindowSetFlavorType(w->dynsf, PETSCSF_WINDOW_FLAVOR_CREATE)); /* break recursion */
    PetscCall(PetscSFSetGraph(w->dynsf, 1, sf->nranks, NULL, PETSC_OWN_POINTER, remotes, PETSC_OWN_POINTER));
  }
  switch (w->sync) {
  case PETSCSF_WINDOW_SYNC_ACTIVE:
    /* compute and cache the in/out groups needed by MPI_Win_post/MPI_Win_start */
    PetscCall(PetscSFGetGroups(sf, &ingroup, &outgroup));
    /* fallthrough */
  default:
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
701 
/* Process -sf_window_sync and -sf_window_flavor options for a PETSCSFWINDOW */
static PetscErrorCode PetscSFSetFromOptions_Window(PetscSF sf, PetscOptionItems *PetscOptionsObject)
{
  PetscSF_Window         *w      = (PetscSF_Window *)sf->data;
  PetscSFWindowFlavorType flavor = w->flavor;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "PetscSF Window options");
  PetscCall(PetscOptionsEnum("-sf_window_sync", "synchronization type to use for PetscSF Window communication", "PetscSFWindowSetSyncType", PetscSFWindowSyncTypes, (PetscEnum)w->sync, (PetscEnum *)&w->sync, NULL));
  /* flavor goes through the setter (rather than being written directly) so the method dispatch runs */
  PetscCall(PetscOptionsEnum("-sf_window_flavor", "flavor to use for PetscSF Window creation", "PetscSFWindowSetFlavorType", PetscSFWindowFlavorTypes, (PetscEnum)flavor, (PetscEnum *)&flavor, NULL));
  PetscCall(PetscSFWindowSetFlavorType(sf, flavor));
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
715 
/* Release all cached data types, windows, the auxiliary dynamic SF, and the info handle */
static PetscErrorCode PetscSFReset_Window(PetscSF sf)
{
  PetscSF_Window *w = (PetscSF_Window *)sf->data;
  PetscSFDataLink link, next;
  PetscSFWinLink  wlink, wnext;
  PetscInt        i;

  PetscFunctionBegin;
  /* free the cached composite MPI data types, one local/remote pair per root rank */
  for (link = w->link; link; link = next) {
    next = link->next;
    PetscCallMPI(MPI_Type_free(&link->unit));
    for (i = 0; i < sf->nranks; i++) {
      PetscCallMPI(MPI_Type_free(&link->mine[i]));
      PetscCallMPI(MPI_Type_free(&link->remote[i]));
    }
    PetscCall(PetscFree2(link->mine, link->remote));
    PetscCall(PetscFree(link));
  }
  w->link = NULL;
  /* free all cached windows; it is a usage error if any is still in use */
  for (wlink = w->wins; wlink; wlink = wnext) {
    wnext = wlink->next;
    PetscCheck(!wlink->inuse, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_WRONGSTATE, "Window still in use with address %p", (void *)wlink->addr);
    PetscCall(PetscFree(wlink->dyn_target_addr));
    PetscCall(PetscFree(wlink->reqs));
    PetscCallMPI(MPI_Win_free(&wlink->win));
    PetscCall(PetscFree(wlink));
  }
  w->wins = NULL;
  PetscCall(PetscSFDestroy(&w->dynsf));
  if (w->info != MPI_INFO_NULL) PetscCallMPI(MPI_Info_free(&w->info));
  PetscFunctionReturn(PETSC_SUCCESS);
}
748 
/* Destroy the window SF: reset all cached state, free the implementation struct,
   and remove the composed type-specific methods */
static PetscErrorCode PetscSFDestroy_Window(PetscSF sf)
{
  PetscFunctionBegin;
  PetscCall(PetscSFReset_Window(sf));
  PetscCall(PetscFree(sf->data));
  PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetSyncType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetSyncType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetFlavorType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetFlavorType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetInfo_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetInfo_C", NULL));
  PetscFunctionReturn(PETSC_SUCCESS);
}
762 
763 static PetscErrorCode PetscSFView_Window(PetscSF sf, PetscViewer viewer)
764 {
765   PetscSF_Window   *w = (PetscSF_Window *)sf->data;
766   PetscBool         iascii;
767   PetscViewerFormat format;
768 
769   PetscFunctionBegin;
770   PetscCall(PetscViewerGetFormat(viewer, &format));
771   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
772   if (iascii) {
773     PetscCall(PetscViewerASCIIPrintf(viewer, "  current flavor=%s synchronization=%s MultiSF sort=%s\n", PetscSFWindowFlavorTypes[w->flavor], PetscSFWindowSyncTypes[w->sync], sf->rankorder ? "rank-order" : "unordered"));
774     if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
775       if (w->info != MPI_INFO_NULL) {
776         PetscMPIInt k, nkeys;
777         char        key[MPI_MAX_INFO_KEY], value[MPI_MAX_INFO_VAL];
778 
779         PetscCallMPI(MPI_Info_get_nkeys(w->info, &nkeys));
780         PetscCall(PetscViewerASCIIPrintf(viewer, "    current info with %d keys. Ordered key-value pairs follow:\n", nkeys));
781         for (k = 0; k < nkeys; k++) {
782           PetscMPIInt flag;
783 
784           PetscCallMPI(MPI_Info_get_nthkey(w->info, k, key));
785           PetscCallMPI(MPI_Info_get(w->info, key, MPI_MAX_INFO_VAL, value, &flag));
786           PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing key %s", key);
787           PetscCall(PetscViewerASCIIPrintf(viewer, "      %s = %s\n", key, value));
788         }
789       } else {
790         PetscCall(PetscViewerASCIIPrintf(viewer, "    current info=MPI_INFO_NULL\n"));
791       }
792     }
793   }
794   PetscFunctionReturn(PETSC_SUCCESS);
795 }
796 
797 static PetscErrorCode PetscSFDuplicate_Window(PetscSF sf, PetscSFDuplicateOption opt, PetscSF newsf)
798 {
799   PetscSF_Window       *w = (PetscSF_Window *)sf->data;
800   PetscSFWindowSyncType synctype;
801 
802   PetscFunctionBegin;
803   synctype = w->sync;
804   /* HACK: Must use FENCE or LOCK when called from PetscSFGetGroups() because ACTIVE here would cause recursion. */
805   if (!sf->setupcalled) synctype = PETSCSF_WINDOW_SYNC_LOCK;
806   PetscCall(PetscSFWindowSetSyncType(newsf, synctype));
807   PetscCall(PetscSFWindowSetFlavorType(newsf, w->flavor));
808   PetscCall(PetscSFWindowSetInfo(newsf, w->info));
809   PetscFunctionReturn(PETSC_SUCCESS);
810 }
811 
812 static PetscErrorCode PetscSFBcastBegin_Window(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, void *leafdata, MPI_Op op)
813 {
814   PetscSF_Window     *w = (PetscSF_Window *)sf->data;
815   PetscInt            i, nranks;
816   const PetscMPIInt  *ranks;
817   const MPI_Aint     *target_disp;
818   const MPI_Datatype *mine, *remote;
819   MPI_Request        *reqs;
820   MPI_Win             win;
821 
822   PetscFunctionBegin;
823   PetscCheck(op == MPI_REPLACE, PetscObjectComm((PetscObject)sf), PETSC_ERR_SUP, "PetscSFBcastBegin_Window with op!=MPI_REPLACE has not been implemented");
824   PetscCall(PetscSFGetRootRanks(sf, &nranks, &ranks, NULL, NULL, NULL));
825   PetscCall(PetscSFWindowGetDataTypes(sf, unit, &mine, &remote));
826   PetscCall(PetscSFGetWindow(sf, unit, (void *)rootdata, w->sync, PETSC_TRUE, MPI_MODE_NOPUT | MPI_MODE_NOPRECEDE, MPI_MODE_NOPUT, 0, &target_disp, &reqs, &win));
827   for (i = 0; i < nranks; i++) {
828     MPI_Aint tdp = target_disp ? target_disp[i] : 0;
829 
830     if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) {
831       PetscCallMPI(MPI_Win_lock(MPI_LOCK_SHARED, ranks[i], MPI_MODE_NOCHECK, win));
832 #if defined(PETSC_HAVE_MPI_RGET)
833       PetscCallMPI(MPI_Rget(leafdata, 1, mine[i], ranks[i], tdp, 1, remote[i], win, &reqs[i]));
834 #else
835       PetscCallMPI(MPI_Get(leafdata, 1, mine[i], ranks[i], tdp, 1, remote[i], win));
836 #endif
837     } else {
838       PetscCallMPI(MPI_Get(leafdata, 1, mine[i], ranks[i], tdp, 1, remote[i], win));
839     }
840   }
841   PetscFunctionReturn(PETSC_SUCCESS);
842 }
843 
844 static PetscErrorCode PetscSFBcastEnd_Window(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata, MPI_Op op)
845 {
846   PetscSF_Window *w = (PetscSF_Window *)sf->data;
847   MPI_Win         win;
848   MPI_Request    *reqs = NULL;
849 
850   PetscFunctionBegin;
851   PetscCall(PetscSFFindWindow(sf, unit, rootdata, &win, &reqs));
852   if (reqs) PetscCallMPI(MPI_Waitall(sf->nranks, reqs, MPI_STATUSES_IGNORE));
853   if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) {
854     PetscInt           i, nranks;
855     const PetscMPIInt *ranks;
856 
857     PetscCall(PetscSFGetRootRanks(sf, &nranks, &ranks, NULL, NULL, NULL));
858     for (i = 0; i < nranks; i++) PetscCallMPI(MPI_Win_unlock(ranks[i], win));
859   }
860   PetscCall(PetscSFRestoreWindow(sf, unit, (void *)rootdata, w->sync, PETSC_TRUE, MPI_MODE_NOSTORE | MPI_MODE_NOSUCCEED, PETSC_FALSE, &win));
861   PetscFunctionReturn(PETSC_SUCCESS);
862 }
863 
864 static PetscErrorCode PetscSFReduceBegin_Window(PetscSF sf, MPI_Datatype unit, PetscMemType leafmtype, const void *leafdata, PetscMemType rootmtype, void *rootdata, MPI_Op op)
865 {
866   PetscSF_Window     *w = (PetscSF_Window *)sf->data;
867   PetscInt            i, nranks;
868   const PetscMPIInt  *ranks;
869   const MPI_Aint     *target_disp;
870   const MPI_Datatype *mine, *remote;
871   MPI_Win             win;
872 
873   PetscFunctionBegin;
874   PetscCall(PetscSFGetRootRanks(sf, &nranks, &ranks, NULL, NULL, NULL));
875   PetscCall(PetscSFWindowGetDataTypes(sf, unit, &mine, &remote));
876   PetscCall(PetscSFWindowOpTranslate(&op));
877   PetscCall(PetscSFGetWindow(sf, unit, rootdata, w->sync, PETSC_TRUE, MPI_MODE_NOPRECEDE, 0, 0, &target_disp, NULL, &win));
878   for (i = 0; i < nranks; i++) {
879     MPI_Aint tdp = target_disp ? target_disp[i] : 0;
880 
881     if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) PetscCallMPI(MPI_Win_lock(MPI_LOCK_SHARED, ranks[i], MPI_MODE_NOCHECK, win));
882     PetscCallMPI(MPI_Accumulate((void *)leafdata, 1, mine[i], ranks[i], tdp, 1, remote[i], op, win));
883     if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) PetscCallMPI(MPI_Win_unlock(ranks[i], win));
884   }
885   PetscFunctionReturn(PETSC_SUCCESS);
886 }
887 
888 static PetscErrorCode PetscSFReduceEnd_Window(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *rootdata, MPI_Op op)
889 {
890   PetscSF_Window *w = (PetscSF_Window *)sf->data;
891   MPI_Win         win;
892   MPI_Request    *reqs = NULL;
893 
894   PetscFunctionBegin;
895   PetscCall(PetscSFFindWindow(sf, unit, rootdata, &win, &reqs));
896   if (reqs) PetscCallMPI(MPI_Waitall(sf->nranks, reqs, MPI_STATUSES_IGNORE));
897   PetscCall(PetscSFRestoreWindow(sf, unit, rootdata, w->sync, PETSC_TRUE, MPI_MODE_NOSUCCEED, PETSC_TRUE, &win));
898   PetscFunctionReturn(PETSC_SUCCESS);
899 }
900 
/* Begin a fetch-and-op: fetch current root values into leafupdate while combining
   leafdata into rootdata with op.  With MPI_Get_accumulate both steps are one atomic
   call; without it they are emulated by an exclusively locked Get+Accumulate pair. */
static PetscErrorCode PetscSFFetchAndOpBegin_Window(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, void *rootdata, PetscMemType leafmtype, const void *leafdata, void *leafupdate, MPI_Op op)
{
  PetscInt            i, nranks;
  const PetscMPIInt  *ranks;
  const MPI_Datatype *mine, *remote;
  const MPI_Aint     *target_disp;
  MPI_Win             win;
  PetscSF_Window     *w = (PetscSF_Window *)sf->data;
#if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  PetscSFWindowFlavorType oldf; /* saved flavor, restored after forcing CREATE below */
#endif

  PetscFunctionBegin;
  PetscCall(PetscSFGetRootRanks(sf, &nranks, &ranks, NULL, NULL, NULL));
  PetscCall(PetscSFWindowGetDataTypes(sf, unit, &mine, &remote));
  /* Map PETSc reduction ops to built-in MPI ops usable inside accumulate calls */
  PetscCall(PetscSFWindowOpTranslate(&op));
#if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  /* FetchAndOp without MPI_Get_Accumulate requires locking.
     we create a new window every time to not interfere with user-defined MPI_Info which may have used "no_locks"="true" */
  oldf      = w->flavor;
  w->flavor = PETSCSF_WINDOW_FLAVOR_CREATE;
  PetscCall(PetscSFGetWindow(sf, unit, rootdata, PETSCSF_WINDOW_SYNC_LOCK, PETSC_FALSE, 0, 0, 0, &target_disp, NULL, &win));
#else
  PetscCall(PetscSFGetWindow(sf, unit, rootdata, w->sync, PETSC_TRUE, MPI_MODE_NOPRECEDE, 0, 0, &target_disp, NULL, &win));
#endif
  for (i = 0; i < nranks; i++) {
    MPI_Aint tdp = target_disp ? target_disp[i] : 0; /* nonzero only for DYNAMIC windows */

#if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
    /* Exclusive lock so the Get (fetch old value) and Accumulate (apply op) form one
       atomic read-modify-write with respect to other origins targeting this rank */
    PetscCallMPI(MPI_Win_lock(MPI_LOCK_EXCLUSIVE, ranks[i], 0, win));
    PetscCallMPI(MPI_Get(leafupdate, 1, mine[i], ranks[i], tdp, 1, remote[i], win));
    PetscCallMPI(MPI_Accumulate((void *)leafdata, 1, mine[i], ranks[i], tdp, 1, remote[i], op, win));
    PetscCallMPI(MPI_Win_unlock(ranks[i], win));
#else
    if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) PetscCallMPI(MPI_Win_lock(MPI_LOCK_SHARED, ranks[i], 0, win));
    PetscCallMPI(MPI_Get_accumulate((void *)leafdata, 1, mine[i], leafupdate, 1, mine[i], ranks[i], tdp, 1, remote[i], op, win));
    if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) PetscCallMPI(MPI_Win_unlock(ranks[i], win));
#endif
  }
#if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  w->flavor = oldf; /* restore user-selected flavor */
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
945 
/* Complete a fetch-and-op begun by PetscSFFetchAndOpBegin_Window: wait on any
   outstanding requests and release the window.  The sync type passed to
   PetscSFRestoreWindow must mirror the one used in Begin (forced LOCK when
   MPI_Get_accumulate is unavailable). */
static PetscErrorCode PetscSFFetchAndOpEnd_Window(PetscSF sf, MPI_Datatype unit, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
{
  MPI_Win win;
#if defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  PetscSF_Window *w = (PetscSF_Window *)sf->data;
#endif
  MPI_Request *reqs = NULL;

  PetscFunctionBegin;
  PetscCall(PetscSFFindWindow(sf, unit, rootdata, &win, &reqs));
  if (reqs) PetscCallMPI(MPI_Waitall(sf->nranks, reqs, MPI_STATUSES_IGNORE));
#if defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  PetscCall(PetscSFRestoreWindow(sf, unit, rootdata, w->sync, PETSC_TRUE, MPI_MODE_NOSUCCEED, PETSC_TRUE, &win));
#else
  PetscCall(PetscSFRestoreWindow(sf, unit, rootdata, PETSCSF_WINDOW_SYNC_LOCK, PETSC_FALSE, 0, PETSC_TRUE, &win));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
964 
965 PETSC_INTERN PetscErrorCode PetscSFCreate_Window(PetscSF sf)
966 {
967   PetscSF_Window *w = (PetscSF_Window *)sf->data;
968 
969   PetscFunctionBegin;
970   sf->ops->SetUp           = PetscSFSetUp_Window;
971   sf->ops->SetFromOptions  = PetscSFSetFromOptions_Window;
972   sf->ops->Reset           = PetscSFReset_Window;
973   sf->ops->Destroy         = PetscSFDestroy_Window;
974   sf->ops->View            = PetscSFView_Window;
975   sf->ops->Duplicate       = PetscSFDuplicate_Window;
976   sf->ops->BcastBegin      = PetscSFBcastBegin_Window;
977   sf->ops->BcastEnd        = PetscSFBcastEnd_Window;
978   sf->ops->ReduceBegin     = PetscSFReduceBegin_Window;
979   sf->ops->ReduceEnd       = PetscSFReduceEnd_Window;
980   sf->ops->FetchAndOpBegin = PetscSFFetchAndOpBegin_Window;
981   sf->ops->FetchAndOpEnd   = PetscSFFetchAndOpEnd_Window;
982 
983   PetscCall(PetscNew(&w));
984   sf->data  = (void *)w;
985   w->sync   = PETSCSF_WINDOW_SYNC_FENCE;
986   w->flavor = PETSCSF_WINDOW_FLAVOR_CREATE;
987   w->info   = MPI_INFO_NULL;
988 
989   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetSyncType_C", PetscSFWindowSetSyncType_Window));
990   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetSyncType_C", PetscSFWindowGetSyncType_Window));
991   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetFlavorType_C", PetscSFWindowSetFlavorType_Window));
992   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetFlavorType_C", PetscSFWindowGetFlavorType_Window));
993   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetInfo_C", PetscSFWindowSetInfo_Window));
994   PetscCall(PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetInfo_C", PetscSFWindowGetInfo_Window));
995 
996 #if defined(OMPI_MAJOR_VERSION) && (OMPI_MAJOR_VERSION < 1 || (OMPI_MAJOR_VERSION == 1 && OMPI_MINOR_VERSION <= 6))
997   {
998     PetscBool ackbug = PETSC_FALSE;
999     PetscCall(PetscOptionsGetBool(NULL, NULL, "-acknowledge_ompi_onesided_bug", &ackbug, NULL));
1000     if (ackbug) {
1001       PetscCall(PetscInfo(sf, "Acknowledged Open MPI bug, proceeding anyway. Expect memory corruption.\n"));
1002     } else SETERRQ(PetscObjectComm((PetscObject)sf), PETSC_ERR_LIB, "Open MPI is known to be buggy (https://svn.open-mpi.org/trac/ompi/ticket/1905 and 2656), use -acknowledge_ompi_onesided_bug to proceed");
1003   }
1004 #endif
1005   PetscFunctionReturn(PETSC_SUCCESS);
1006 }
1007