xref: /petsc/src/vec/is/sf/impls/window/sfwindow.c (revision b122ec5aa1bd4469eb4e0673542fb7de3f411254)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 
3 typedef struct _n_PetscSFDataLink *PetscSFDataLink;
4 typedef struct _n_PetscSFWinLink  *PetscSFWinLink;
5 
6 typedef struct {
7   PetscSFWindowSyncType   sync;   /* FENCE, LOCK, or ACTIVE synchronization */
8   PetscSFDataLink         link;   /* List of MPI data types, lazily constructed for each data type */
9   PetscSFWinLink          wins;   /* List of active windows */
10   PetscSFWindowFlavorType flavor; /* Current PETSCSF_WINDOW_FLAVOR_ */
11   PetscSF                 dynsf;
12   MPI_Info                info;
13 } PetscSF_Window;
14 
15 struct _n_PetscSFDataLink {
16   MPI_Datatype    unit;
17   MPI_Datatype    *mine;
18   MPI_Datatype    *remote;
19   PetscSFDataLink next;
20 };
21 
22 struct _n_PetscSFWinLink {
23   PetscBool               inuse;
24   size_t                  bytes;
25   void                    *addr;
26   void                    *paddr;
27   MPI_Win                 win;
28   MPI_Request             *reqs;
29   PetscSFWindowFlavorType flavor;
30   MPI_Aint                *dyn_target_addr;
31   PetscBool               epoch;
32   PetscSFWinLink          next;
33 };
34 
35 const char *const PetscSFWindowSyncTypes[] = {"FENCE","LOCK","ACTIVE","PetscSFWindowSyncType","PETSCSF_WINDOW_SYNC_",NULL};
36 const char *const PetscSFWindowFlavorTypes[] = {"CREATE","DYNAMIC","ALLOCATE","SHARED","PetscSFWindowFlavorType","PETSCSF_WINDOW_FLAVOR_",NULL};
37 
38 /* Built-in MPI_Ops act elementwise inside MPI_Accumulate, but cannot be used with composite types inside collectives (MPI_Allreduce) */
39 static PetscErrorCode PetscSFWindowOpTranslate(MPI_Op *op)
40 {
41   PetscFunctionBegin;
42   if (*op == MPIU_SUM) *op = MPI_SUM;
43   else if (*op == MPIU_MAX) *op = MPI_MAX;
44   else if (*op == MPIU_MIN) *op = MPI_MIN;
45   PetscFunctionReturn(0);
46 }
47 
48 /*@C
49    PetscSFWindowGetDataTypes - gets composite local and remote data types for each rank
50 
51    Not Collective
52 
53    Input Parameters:
54 +  sf - star forest
55 -  unit - data type for each node
56 
57    Output Parameters:
58 +  localtypes - types describing part of local leaf buffer referencing each remote rank
59 -  remotetypes - types describing part of remote root buffer referenced for each remote rank
60 
61    Level: developer
62 
63 .seealso: PetscSFSetGraph(), PetscSFView()
64 @*/
static PetscErrorCode PetscSFWindowGetDataTypes(PetscSF sf,MPI_Datatype unit,const MPI_Datatype **localtypes,const MPI_Datatype **remotetypes)
{
  PetscSF_Window    *w = (PetscSF_Window*)sf->data;
  PetscSFDataLink   link;
  PetscInt          i,nranks;
  const PetscInt    *roffset,*rmine,*rremote;
  const PetscMPIInt *ranks;

  PetscFunctionBegin;
  /* Look for types in cache; the cache is keyed on the unit datatype */
  for (link=w->link; link; link=link->next) {
    PetscBool match;
    CHKERRQ(MPIPetsc_Type_compare(unit,link->unit,&match));
    if (match) {
      *localtypes  = link->mine;
      *remotetypes = link->remote;
      PetscFunctionReturn(0);
    }
  }

  /* Create new composite types for each send rank */
  CHKERRQ(PetscSFGetRootRanks(sf,&nranks,&ranks,&roffset,&rmine,&rremote));
  CHKERRQ(PetscNew(&link));
  CHKERRMPI(MPI_Type_dup(unit,&link->unit));
  CHKERRQ(PetscMalloc2(nranks,&link->mine,nranks,&link->remote));
  for (i=0; i<nranks; i++) {
    PetscInt    rcount = roffset[i+1] - roffset[i];
    PetscMPIInt *rmine,*rremote; /* NOTE: intentionally shadows the const PetscInt* views above */
#if !defined(PETSC_USE_64BIT_INDICES)
    /* PetscInt and PetscMPIInt are both int here, so the SF index arrays can be used in place */
    rmine   = sf->rmine + sf->roffset[i];
    rremote = sf->rremote + sf->roffset[i];
#else
    /* 64-bit PetscInt: indices must be narrowed (with overflow check) to PetscMPIInt for MPI */
    PetscInt j;
    CHKERRQ(PetscMalloc2(rcount,&rmine,rcount,&rremote));
    for (j=0; j<rcount; j++) {
      CHKERRQ(PetscMPIIntCast(sf->rmine[sf->roffset[i]+j],rmine+j));
      CHKERRQ(PetscMPIIntCast(sf->rremote[sf->roffset[i]+j],rremote+j));
    }
#endif

    CHKERRMPI(MPI_Type_create_indexed_block(rcount,1,rmine,link->unit,&link->mine[i]));
    CHKERRMPI(MPI_Type_create_indexed_block(rcount,1,rremote,link->unit,&link->remote[i]));
#if defined(PETSC_USE_64BIT_INDICES)
    CHKERRQ(PetscFree2(rmine,rremote));
#endif
    CHKERRMPI(MPI_Type_commit(&link->mine[i]));
    CHKERRMPI(MPI_Type_commit(&link->remote[i]));
  }
  /* Prepend the new entry to the cache list */
  link->next = w->link;
  w->link    = link;

  *localtypes  = link->mine;
  *remotetypes = link->remote;
  PetscFunctionReturn(0);
}
120 
121 /*@C
122    PetscSFWindowSetFlavorType - Set flavor type for MPI_Win creation
123 
124    Logically Collective
125 
126    Input Parameters:
127 +  sf - star forest for communication
128 -  flavor - flavor type
129 
130    Options Database Key:
131 .  -sf_window_flavor <flavor> - sets the flavor type CREATE, DYNAMIC, ALLOCATE or SHARED (see PetscSFWindowFlavorType)
132 
133    Level: advanced
134 
   Notes: Window reuse follows these rules:
136 
137      PETSCSF_WINDOW_FLAVOR_CREATE: creates a new window every time, uses MPI_Win_create
138 
139      PETSCSF_WINDOW_FLAVOR_DYNAMIC: uses MPI_Win_create_dynamic/MPI_Win_attach and tries to reuse windows by comparing the root array. Intended to be used on repeated applications of the same SF, e.g.
140        for i=1 to K
141          PetscSFOperationBegin(rootdata1,leafdata_whatever);
142          PetscSFOperationEnd(rootdata1,leafdata_whatever);
143          ...
144          PetscSFOperationBegin(rootdataN,leafdata_whatever);
145          PetscSFOperationEnd(rootdataN,leafdata_whatever);
146        endfor
147        The following pattern will instead raise an error
148          PetscSFOperationBegin(rootdata1,leafdata_whatever);
149          PetscSFOperationEnd(rootdata1,leafdata_whatever);
150          PetscSFOperationBegin(rank ? rootdata1 : rootdata2,leafdata_whatever);
151          PetscSFOperationEnd(rank ? rootdata1 : rootdata2,leafdata_whatever);
152 
153      PETSCSF_WINDOW_FLAVOR_ALLOCATE: uses MPI_Win_allocate, reuses any pre-existing window which fits the data and it is not in use
154 
155      PETSCSF_WINDOW_FLAVOR_SHARED: uses MPI_Win_allocate_shared, reusage policy as for PETSCSF_WINDOW_FLAVOR_ALLOCATE
156 
157 .seealso: PetscSFSetFromOptions(), PetscSFWindowGetFlavorType()
158 @*/
PetscErrorCode PetscSFWindowSetFlavorType(PetscSF sf,PetscSFWindowFlavorType flavor)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveEnum(sf,flavor,2);
  /* TryMethod: silently a no-op if this SF is not of window type */
  CHKERRQ(PetscTryMethod(sf,"PetscSFWindowSetFlavorType_C",(PetscSF,PetscSFWindowFlavorType),(sf,flavor)));
  PetscFunctionReturn(0);
}
167 
/* Window-type implementation backing PetscSFWindowSetFlavorType() */
static PetscErrorCode PetscSFWindowSetFlavorType_Window(PetscSF sf,PetscSFWindowFlavorType flavor)
{
  PetscSF_Window *w = (PetscSF_Window*)sf->data;

  PetscFunctionBegin;
  w->flavor = flavor;
  PetscFunctionReturn(0);
}
176 
177 /*@C
178    PetscSFWindowGetFlavorType - Get flavor type for PetscSF communication
179 
180    Logically Collective
181 
182    Input Parameter:
183 .  sf - star forest for communication
184 
185    Output Parameter:
186 .  flavor - flavor type
187 
188    Level: advanced
189 
190 .seealso: PetscSFSetFromOptions(), PetscSFWindowSetFlavorType()
191 @*/
PetscErrorCode PetscSFWindowGetFlavorType(PetscSF sf,PetscSFWindowFlavorType *flavor)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidPointer(flavor,2);
  /* UseMethod: errors if this SF type does not provide the query */
  CHKERRQ(PetscUseMethod(sf,"PetscSFWindowGetFlavorType_C",(PetscSF,PetscSFWindowFlavorType*),(sf,flavor)));
  PetscFunctionReturn(0);
}
200 
/* Window-type implementation backing PetscSFWindowGetFlavorType() */
static PetscErrorCode PetscSFWindowGetFlavorType_Window(PetscSF sf,PetscSFWindowFlavorType *flavor)
{
  PetscSF_Window *w = (PetscSF_Window*)sf->data;

  PetscFunctionBegin;
  *flavor = w->flavor;
  PetscFunctionReturn(0);
}
209 
210 /*@C
211    PetscSFWindowSetSyncType - Set synchronization type for PetscSF communication
212 
213    Logically Collective
214 
215    Input Parameters:
216 +  sf - star forest for communication
217 -  sync - synchronization type
218 
219    Options Database Key:
220 .  -sf_window_sync <sync> - sets the synchronization type FENCE, LOCK, or ACTIVE (see PetscSFWindowSyncType)
221 
222    Level: advanced
223 
224 .seealso: PetscSFSetFromOptions(), PetscSFWindowGetSyncType()
225 @*/
PetscErrorCode PetscSFWindowSetSyncType(PetscSF sf,PetscSFWindowSyncType sync)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveEnum(sf,sync,2);
  /* TryMethod: silently a no-op if this SF is not of window type */
  CHKERRQ(PetscTryMethod(sf,"PetscSFWindowSetSyncType_C",(PetscSF,PetscSFWindowSyncType),(sf,sync)));
  PetscFunctionReturn(0);
}
234 
/* Window-type implementation backing PetscSFWindowSetSyncType() */
static PetscErrorCode PetscSFWindowSetSyncType_Window(PetscSF sf,PetscSFWindowSyncType sync)
{
  PetscSF_Window *w = (PetscSF_Window*)sf->data;

  PetscFunctionBegin;
  w->sync = sync;
  PetscFunctionReturn(0);
}
243 
244 /*@C
245    PetscSFWindowGetSyncType - Get synchronization type for PetscSF communication
246 
247    Logically Collective
248 
249    Input Parameter:
250 .  sf - star forest for communication
251 
252    Output Parameter:
253 .  sync - synchronization type
254 
255    Level: advanced
256 
257 .seealso: PetscSFSetFromOptions(), PetscSFWindowSetSyncType()
258 @*/
PetscErrorCode PetscSFWindowGetSyncType(PetscSF sf,PetscSFWindowSyncType *sync)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidPointer(sync,2);
  /* UseMethod: errors if this SF type does not provide the query */
  CHKERRQ(PetscUseMethod(sf,"PetscSFWindowGetSyncType_C",(PetscSF,PetscSFWindowSyncType*),(sf,sync)));
  PetscFunctionReturn(0);
}
267 
/* Window-type implementation backing PetscSFWindowGetSyncType() */
static PetscErrorCode PetscSFWindowGetSyncType_Window(PetscSF sf,PetscSFWindowSyncType *sync)
{
  PetscSF_Window *w = (PetscSF_Window*)sf->data;

  PetscFunctionBegin;
  *sync = w->sync;
  PetscFunctionReturn(0);
}
276 
277 /*@C
278    PetscSFWindowSetInfo - Set the MPI_Info handle that will be used for subsequent windows allocation
279 
280    Logically Collective
281 
282    Input Parameters:
283 +  sf - star forest for communication
284 -  info - MPI_Info handle
285 
286    Level: advanced
287 
288    Notes: the info handle is duplicated with a call to MPI_Info_dup unless info = MPI_INFO_NULL.
289 
290 .seealso: PetscSFSetFromOptions(), PetscSFWindowGetInfo()
291 @*/
PetscErrorCode PetscSFWindowSetInfo(PetscSF sf,MPI_Info info)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* TryMethod: silently a no-op if this SF is not of window type */
  CHKERRQ(PetscTryMethod(sf,"PetscSFWindowSetInfo_C",(PetscSF,MPI_Info),(sf,info)));
  PetscFunctionReturn(0);
}
299 
300 static PetscErrorCode PetscSFWindowSetInfo_Window(PetscSF sf,MPI_Info info)
301 {
302   PetscSF_Window *w = (PetscSF_Window*)sf->data;
303 
304   PetscFunctionBegin;
305   if (w->info != MPI_INFO_NULL) {
306     CHKERRMPI(MPI_Info_free(&w->info));
307   }
308   if (info != MPI_INFO_NULL) {
309     CHKERRMPI(MPI_Info_dup(info,&w->info));
310   }
311   PetscFunctionReturn(0);
312 }
313 
314 /*@C
315    PetscSFWindowGetInfo - Get the MPI_Info handle used for windows allocation
316 
317    Logically Collective
318 
319    Input Parameter:
320 .  sf - star forest for communication
321 
322    Output Parameter:
323 .  info - MPI_Info handle
324 
325    Level: advanced
326 
   Notes: if PetscSFWindowSetInfo() has not been called, this returns MPI_INFO_NULL
328 
329 .seealso: PetscSFSetFromOptions(), PetscSFWindowSetInfo()
330 @*/
PetscErrorCode PetscSFWindowGetInfo(PetscSF sf,MPI_Info *info)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidPointer(info,2);
  /* UseMethod: errors if this SF type does not provide the query */
  CHKERRQ(PetscUseMethod(sf,"PetscSFWindowGetInfo_C",(PetscSF,MPI_Info*),(sf,info)));
  PetscFunctionReturn(0);
}
339 
/* Window-type implementation backing PetscSFWindowGetInfo(); returns the stored
   handle without duplicating it (MPI_INFO_NULL if never set). */
static PetscErrorCode PetscSFWindowGetInfo_Window(PetscSF sf,MPI_Info *info)
{
  PetscSF_Window *w = (PetscSF_Window*)sf->data;

  PetscFunctionBegin;
  *info = w->info;
  PetscFunctionReturn(0);
}
348 
349 /*
350    PetscSFGetWindow - Get a window for use with a given data type
351 
352    Collective on PetscSF
353 
354    Input Parameters:
355 +  sf - star forest
356 .  unit - data type
357 .  array - array to be sent
358 .  sync - type of synchronization PetscSFWindowSyncType
359 .  epoch - PETSC_TRUE to acquire the window and start an epoch, PETSC_FALSE to just acquire the window
360 .  fenceassert - assert parameter for call to MPI_Win_fence(), if sync == PETSCSF_WINDOW_SYNC_FENCE
361 .  postassert - assert parameter for call to MPI_Win_post(), if sync == PETSCSF_WINDOW_SYNC_ACTIVE
362 -  startassert - assert parameter for call to MPI_Win_start(), if sync == PETSCSF_WINDOW_SYNC_ACTIVE
363 
364    Output Parameters:
+  target_disp - target_disp argument for RMA calls (significant for PETSCSF_WINDOW_FLAVOR_DYNAMIC only)
.  reqs - array of requests (significant for sync == PETSCSF_WINDOW_SYNC_LOCK only)
367 -  win - window
368 
369    Level: developer
370 .seealso: PetscSFGetRootRanks(), PetscSFWindowGetDataTypes()
371 */
static PetscErrorCode PetscSFGetWindow(PetscSF sf,MPI_Datatype unit,void *array,PetscSFWindowSyncType sync,PetscBool epoch,PetscMPIInt fenceassert,PetscMPIInt postassert,PetscMPIInt startassert,const MPI_Aint **target_disp, MPI_Request **reqs, MPI_Win *win)
{
  PetscSF_Window *w = (PetscSF_Window*)sf->data;
  MPI_Aint       lb,lb_true,bytes,bytes_true;
  PetscSFWinLink link;
#if defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
  MPI_Aint       winaddr;
  PetscInt       nranks;
#endif
  PetscBool      reuse = PETSC_FALSE, update = PETSC_FALSE;
  PetscBool      dummy[2];
  MPI_Aint       wsize;

  PetscFunctionBegin;
  /* Datatypes with a nonzero lower bound or with holes (extent != true extent) are not supported */
  CHKERRMPI(MPI_Type_get_extent(unit,&lb,&bytes));
  CHKERRMPI(MPI_Type_get_true_extent(unit,&lb_true,&bytes_true));
  PetscCheckFalse(lb != 0 || lb_true != 0,PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"No support for unit type with nonzero lower bound, write petsc-maint@mcs.anl.gov if you want this feature");
  PetscCheckFalse(bytes != bytes_true,PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"No support for unit type with modified extent, write petsc-maint@mcs.anl.gov if you want this feature");
  if (w->flavor != PETSCSF_WINDOW_FLAVOR_CREATE) reuse = PETSC_TRUE;
  /* Try to reuse a cached window; the CREATE flavor always makes a new one */
  for (link=w->wins; reuse && link; link=link->next) {
    PetscBool winok = PETSC_FALSE;
    if (w->flavor != link->flavor) continue;
    switch (w->flavor) {
    case PETSCSF_WINDOW_FLAVOR_DYNAMIC: /* check available matching array, error if in use (we additionally check that the matching condition is the same across processes) */
      if (array == link->addr) {
        if (PetscDefined(USE_DEBUG)) {
          /* Collective consistency check: all ranks must agree on whether the array matched
             (LAND and LOR of the local match flag must coincide) */
          dummy[0] = PETSC_TRUE;
          dummy[1] = PETSC_TRUE;
          CHKERRMPI(MPI_Allreduce(MPI_IN_PLACE,dummy,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)sf)));
          CHKERRMPI(MPI_Allreduce(MPI_IN_PLACE,dummy+1,1,MPIU_BOOL,MPI_LOR ,PetscObjectComm((PetscObject)sf)));
          PetscCheckFalse(dummy[0] != dummy[1],PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"PETSCSF_WINDOW_FLAVOR_DYNAMIC requires root pointers to be consistently used across the comm. Use PETSCSF_WINDOW_FLAVOR_CREATE or PETSCSF_WINDOW_FLAVOR_ALLOCATE instead");
        }
        PetscCheck(!link->inuse,PetscObjectComm((PetscObject)sf),PETSC_ERR_PLIB,"Window in use");
        PetscCheckFalse(epoch && link->epoch,PetscObjectComm((PetscObject)sf),PETSC_ERR_PLIB,"Window epoch not finished");
        winok = PETSC_TRUE;
        link->paddr = array;
      } else if (PetscDefined(USE_DEBUG)) {
        /* Same collective check on the non-matching ranks, so the Allreduces above stay matched */
        dummy[0] = PETSC_FALSE;
        dummy[1] = PETSC_FALSE;
        CHKERRMPI(MPI_Allreduce(MPI_IN_PLACE,dummy  ,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)sf)));
        CHKERRMPI(MPI_Allreduce(MPI_IN_PLACE,dummy+1,1,MPIU_BOOL,MPI_LOR ,PetscObjectComm((PetscObject)sf)));
        PetscCheckFalse(dummy[0] != dummy[1],PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"PETSCSF_WINDOW_FLAVOR_DYNAMIC requires root pointers to be consistently used across the comm. Use PETSCSF_WINDOW_FLAVOR_CREATE or PETSCSF_WINDOW_FLAVOR_ALLOCATE instead");
      }
      break;
    case PETSCSF_WINDOW_FLAVOR_ALLOCATE: /* check available by matching size, allocate if in use */
    case PETSCSF_WINDOW_FLAVOR_SHARED:
      if (!link->inuse && bytes == (MPI_Aint)link->bytes) {
        update = PETSC_TRUE; /* window memory differs from the user array: must copy in (and out on restore) */
        link->paddr = array;
        winok = PETSC_TRUE;
      }
      break;
    default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"No support for flavor %s",PetscSFWindowFlavorTypes[w->flavor]);
    }
    if (winok) {
      *win = link->win;
      CHKERRQ(PetscInfo(sf,"Reusing window %" PETSC_MPI_WIN_FMT " of flavor %d for comm %" PETSC_MPI_COMM_FMT "\n",link->win,link->flavor,PetscObjectComm((PetscObject)sf)));
      goto found;
    }
  }

  /* No reusable window: create a new link and window of the requested flavor */
  wsize = (MPI_Aint)bytes*sf->nroots;
  CHKERRQ(PetscNew(&link));
  link->bytes           = bytes;
  link->next            = w->wins;
  link->flavor          = w->flavor;
  link->dyn_target_addr = NULL;
  link->reqs            = NULL;
  w->wins               = link;
  if (sync == PETSCSF_WINDOW_SYNC_LOCK) {
    PetscInt i;

    /* One request slot per root rank; filled in by the communication routines */
    CHKERRQ(PetscMalloc1(sf->nranks,&link->reqs));
    for (i = 0; i < sf->nranks; i++) link->reqs[i] = MPI_REQUEST_NULL;
  }
  switch (w->flavor) {
  case PETSCSF_WINDOW_FLAVOR_CREATE:
    CHKERRMPI(MPI_Win_create(array,wsize,(PetscMPIInt)bytes,w->info,PetscObjectComm((PetscObject)sf),&link->win));
    link->addr  = array;
    link->paddr = array;
    break;
#if defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
  case PETSCSF_WINDOW_FLAVOR_DYNAMIC:
    CHKERRMPI(MPI_Win_create_dynamic(w->info,PetscObjectComm((PetscObject)sf),&link->win));
#if defined(PETSC_HAVE_OMPI_MAJOR_VERSION) /* some OpenMPI versions do not support MPI_Win_attach(win,NULL,0); */
    int dummy = 0; /* NOTE: shadows the PetscBool dummy[2] above; used only as a non-NULL attach address */
    CHKERRMPI(MPI_Win_attach(link->win,wsize ? array : (void*)&dummy,wsize));
#else
    CHKERRMPI(MPI_Win_attach(link->win,array,wsize));
#endif
    link->addr  = array;
    link->paddr = array;
    PetscCheck(w->dynsf,PetscObjectComm((PetscObject)sf),PETSC_ERR_ORDER,"Must call PetscSFSetUp()");
    CHKERRQ(PetscSFSetUp(w->dynsf));
    CHKERRQ(PetscSFGetRootRanks(w->dynsf,&nranks,NULL,NULL,NULL,NULL));
    /* Dynamic windows require the remote base addresses for target_disp; broadcast
       each root's attached address to its leaves via the helper SF */
    CHKERRQ(PetscMalloc1(nranks,&link->dyn_target_addr));
    CHKERRMPI(MPI_Get_address(array,&winaddr));
    CHKERRQ(PetscSFBcastBegin(w->dynsf,MPI_AINT,&winaddr,link->dyn_target_addr,MPI_REPLACE));
    CHKERRQ(PetscSFBcastEnd(w->dynsf,MPI_AINT,&winaddr,link->dyn_target_addr,MPI_REPLACE));
    break;
  case PETSCSF_WINDOW_FLAVOR_ALLOCATE:
    CHKERRMPI(MPI_Win_allocate(wsize,(PetscMPIInt)bytes,w->info,PetscObjectComm((PetscObject)sf),&link->addr,&link->win));
    update = PETSC_TRUE; /* MPI owns the window memory: user data must be copied in */
    link->paddr = array;
    break;
#endif
#if defined(PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY)
  case PETSCSF_WINDOW_FLAVOR_SHARED:
    CHKERRMPI(MPI_Win_allocate_shared(wsize,(PetscMPIInt)bytes,w->info,PetscObjectComm((PetscObject)sf),&link->addr,&link->win));
    update = PETSC_TRUE; /* MPI owns the window memory: user data must be copied in */
    link->paddr = array;
    break;
#endif
  default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"No support for flavor %s",PetscSFWindowFlavorTypes[w->flavor]);
  }
  CHKERRQ(PetscInfo(sf,"New window %" PETSC_MPI_WIN_FMT " of flavor %d for comm %" PETSC_MPI_COMM_FMT "\n",link->win,link->flavor,PetscObjectComm((PetscObject)sf)));
  *win = link->win;

found:

  if (target_disp) *target_disp = link->dyn_target_addr;
  if (reqs) *reqs = link->reqs;
  if (update) { /* locks are needed for the "separate" memory model only, the fence guaranties memory-synchronization */
    PetscMPIInt rank;

    CHKERRMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank));
    if (sync == PETSCSF_WINDOW_SYNC_LOCK) CHKERRMPI(MPI_Win_lock(MPI_LOCK_EXCLUSIVE,rank,MPI_MODE_NOCHECK,*win));
    CHKERRQ(PetscMemcpy(link->addr,array,sf->nroots*bytes));
    if (sync == PETSCSF_WINDOW_SYNC_LOCK) {
      CHKERRMPI(MPI_Win_unlock(rank,*win));
      CHKERRMPI(MPI_Win_fence(0,*win));
    }
  }
  link->inuse = PETSC_TRUE;
  link->epoch = epoch;
  if (epoch) {
    /* Open the access/exposure epoch according to the synchronization model */
    switch (sync) {
    case PETSCSF_WINDOW_SYNC_FENCE:
      CHKERRMPI(MPI_Win_fence(fenceassert,*win));
      break;
    case PETSCSF_WINDOW_SYNC_LOCK: /* Handled outside */
      break;
    case PETSCSF_WINDOW_SYNC_ACTIVE: {
      MPI_Group   ingroup,outgroup;
      PetscMPIInt isize,osize;

      /* OpenMPI 4.0.2 with btl=vader does not like calling
         - MPI_Win_complete when ogroup is empty
         - MPI_Win_wait when igroup is empty
         So, we do not even issue the corresponding start and post calls
         The MPI standard (Sec. 11.5.2 of MPI 3.1) only requires that
         start(outgroup) has a matching post(ingroup)
         and this is guaranteed by PetscSF
      */
      CHKERRQ(PetscSFGetGroups(sf,&ingroup,&outgroup));
      CHKERRMPI(MPI_Group_size(ingroup,&isize));
      CHKERRMPI(MPI_Group_size(outgroup,&osize));
      if (isize) CHKERRMPI(MPI_Win_post(ingroup,postassert,*win));
      if (osize) CHKERRMPI(MPI_Win_start(outgroup,startassert,*win));
    } break;
    default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_PLIB,"Unknown synchronization type");
    }
  }
  PetscFunctionReturn(0);
}
537 
538 /*
539    PetscSFFindWindow - Finds a window that is already in use
540 
541    Not Collective
542 
543    Input Parameters:
544 +  sf - star forest
545 .  unit - data type
546 -  array - array with which the window is associated
547 
548    Output Parameters:
549 +  win - window
550 -  reqs - outstanding requests associated to the window
551 
552    Level: developer
553 
554 .seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
555 */
556 static PetscErrorCode PetscSFFindWindow(PetscSF sf,MPI_Datatype unit,const void *array,MPI_Win *win,MPI_Request **reqs)
557 {
558   PetscSF_Window *w = (PetscSF_Window*)sf->data;
559   PetscSFWinLink link;
560 
561   PetscFunctionBegin;
562   *win = MPI_WIN_NULL;
563   for (link=w->wins; link; link=link->next) {
564     if (array == link->paddr) {
565 
566       CHKERRQ(PetscInfo(sf,"Window %" PETSC_MPI_WIN_FMT " of flavor %d for comm %" PETSC_MPI_COMM_FMT "\n",link->win,link->flavor,PetscObjectComm((PetscObject)sf)));
567       *win = link->win;
568       *reqs = link->reqs;
569       PetscFunctionReturn(0);
570     }
571   }
572   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Requested window not in use");
573 }
574 
575 /*
576    PetscSFRestoreWindow - Restores a window obtained with PetscSFGetWindow()
577 
578    Collective
579 
580    Input Parameters:
581 +  sf - star forest
582 .  unit - data type
583 .  array - array associated with window
584 .  sync - type of synchronization PetscSFWindowSyncType
585 .  epoch - close an epoch, must match argument to PetscSFGetWindow()
586 .  update - if we have to update the local window array
587 -  win - window
588 
589    Level: developer
590 
591 .seealso: PetscSFFindWindow()
592 */
593 static PetscErrorCode PetscSFRestoreWindow(PetscSF sf,MPI_Datatype unit,void *array,PetscSFWindowSyncType sync,PetscBool epoch,PetscMPIInt fenceassert,PetscBool update,MPI_Win *win)
594 {
595   PetscSF_Window          *w = (PetscSF_Window*)sf->data;
596   PetscSFWinLink          *p,link;
597   PetscBool               reuse = PETSC_FALSE;
598   PetscSFWindowFlavorType flavor;
599   void*                   laddr;
600   size_t                  bytes;
601 
602   PetscFunctionBegin;
603   for (p=&w->wins; *p; p=&(*p)->next) {
604     link = *p;
605     if (*win == link->win) {
606       PetscCheckFalse(array != link->paddr,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Matched window, but not array");
607       if (epoch != link->epoch) {
608         PetscCheck(!epoch,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"No epoch to end");
609         else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Restoring window without ending epoch");
610       }
611       laddr = link->addr;
612       flavor = link->flavor;
613       bytes = link->bytes;
614       if (flavor != PETSCSF_WINDOW_FLAVOR_CREATE) reuse = PETSC_TRUE;
615       else { *p = link->next; update = PETSC_FALSE; } /* remove from list */
616       goto found;
617     }
618   }
619   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Requested window not in use");
620 
621 found:
622   CHKERRQ(PetscInfo(sf,"Window %" PETSC_MPI_WIN_FMT " of flavor %d for comm %" PETSC_MPI_COMM_FMT "\n",link->win,link->flavor,PetscObjectComm((PetscObject)sf)));
623   if (epoch) {
624     switch (sync) {
625     case PETSCSF_WINDOW_SYNC_FENCE:
626       CHKERRMPI(MPI_Win_fence(fenceassert,*win));
627       break;
628     case PETSCSF_WINDOW_SYNC_LOCK: /* Handled outside */
629       break;
630     case PETSCSF_WINDOW_SYNC_ACTIVE: {
631       MPI_Group   ingroup,outgroup;
632       PetscMPIInt isize,osize;
633 
634       /* OpenMPI 4.0.2 with btl=wader does not like calling
635          - MPI_Win_complete when ogroup is empty
636          - MPI_Win_wait when igroup is empty
637          The MPI standard (Sec. 11.5.2 of MPI 3.1) only requires that
638          - each process who issues a call to MPI_Win_start issues a call to MPI_Win_Complete
639          - each process who issues a call to MPI_Win_post issues a call to MPI_Win_Wait
640       */
641       CHKERRQ(PetscSFGetGroups(sf,&ingroup,&outgroup));
642       CHKERRMPI(MPI_Group_size(ingroup,&isize));
643       CHKERRMPI(MPI_Group_size(outgroup,&osize));
644       if (osize) CHKERRMPI(MPI_Win_complete(*win));
645       if (isize) CHKERRMPI(MPI_Win_wait(*win));
646     } break;
647     default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_PLIB,"Unknown synchronization type");
648     }
649   }
650   if (update) {
651     if (sync == PETSCSF_WINDOW_SYNC_LOCK) {
652       CHKERRMPI(MPI_Win_fence(MPI_MODE_NOPUT|MPI_MODE_NOSUCCEED,*win));
653     }
654     CHKERRQ(PetscMemcpy(array,laddr,sf->nroots*bytes));
655   }
656   link->epoch = PETSC_FALSE;
657   link->inuse = PETSC_FALSE;
658   link->paddr = NULL;
659   if (!reuse) {
660     CHKERRQ(PetscFree(link->dyn_target_addr));
661     CHKERRQ(PetscFree(link->reqs));
662     CHKERRMPI(MPI_Win_free(&link->win));
663     CHKERRQ(PetscFree(link));
664     *win = MPI_WIN_NULL;
665   }
666   PetscFunctionReturn(0);
667 }
668 
static PetscErrorCode PetscSFSetUp_Window(PetscSF sf)
{
  PetscSF_Window *w = (PetscSF_Window*)sf->data;
  MPI_Group      ingroup,outgroup;

  PetscFunctionBegin;
  CHKERRQ(PetscSFSetUpRanks(sf,MPI_GROUP_EMPTY));
  if (!w->dynsf) {
    PetscInt    i;
    PetscSFNode *remotes;

    /* Build a one-root-per-rank helper SF used by the DYNAMIC flavor to broadcast
       remote window base addresses (see PetscSFGetWindow) */
    CHKERRQ(PetscMalloc1(sf->nranks,&remotes));
    for (i=0;i<sf->nranks;i++) {
      remotes[i].rank  = sf->ranks[i];
      remotes[i].index = 0;
    }
    CHKERRQ(PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&w->dynsf));
    CHKERRQ(PetscSFWindowSetFlavorType(w->dynsf,PETSCSF_WINDOW_FLAVOR_CREATE)); /* break recursion */
    CHKERRQ(PetscSFSetGraph(w->dynsf,1,sf->nranks,NULL,PETSC_OWN_POINTER,remotes,PETSC_OWN_POINTER));
    CHKERRQ(PetscLogObjectParent((PetscObject)sf,(PetscObject)w->dynsf));
  }
  switch (w->sync) {
  case PETSCSF_WINDOW_SYNC_ACTIVE:
    /* Compute (and cache) the in/out groups now so Begin/End need no extra collectives;
       intentional fallthrough into default */
    CHKERRQ(PetscSFGetGroups(sf,&ingroup,&outgroup));
  default:
    break;
  }
  PetscFunctionReturn(0);
}
698 
699 static PetscErrorCode PetscSFSetFromOptions_Window(PetscOptionItems *PetscOptionsObject,PetscSF sf)
700 {
701   PetscSF_Window          *w = (PetscSF_Window*)sf->data;
702   PetscSFWindowFlavorType flavor = w->flavor;
703 
704   PetscFunctionBegin;
705   CHKERRQ(PetscOptionsHead(PetscOptionsObject,"PetscSF Window options"));
706   CHKERRQ(PetscOptionsEnum("-sf_window_sync","synchronization type to use for PetscSF Window communication","PetscSFWindowSetSyncType",PetscSFWindowSyncTypes,(PetscEnum)w->sync,(PetscEnum*)&w->sync,NULL));
707   CHKERRQ(PetscOptionsEnum("-sf_window_flavor","flavor to use for PetscSF Window creation","PetscSFWindowSetFlavorType",PetscSFWindowFlavorTypes,(PetscEnum)flavor,(PetscEnum*)&flavor,NULL));
708   CHKERRQ(PetscSFWindowSetFlavorType(sf,flavor));
709   CHKERRQ(PetscOptionsTail());
710   PetscFunctionReturn(0);
711 }
712 
/* Release every resource held by the window implementation: the datatype cache,
   all cached windows (must not be in use), the dynamic helper SF, and the info handle. */
static PetscErrorCode PetscSFReset_Window(PetscSF sf)
{
  PetscSF_Window  *w = (PetscSF_Window*)sf->data;
  PetscSFDataLink link,next;
  PetscSFWinLink  wlink,wnext;
  PetscInt        i;

  PetscFunctionBegin;
  /* Free the cached composite datatypes (one mine/remote pair per root rank, per unit type) */
  for (link=w->link; link; link=next) {
    next = link->next;
    CHKERRMPI(MPI_Type_free(&link->unit));
    for (i=0; i<sf->nranks; i++) {
      CHKERRMPI(MPI_Type_free(&link->mine[i]));
      CHKERRMPI(MPI_Type_free(&link->remote[i]));
    }
    CHKERRQ(PetscFree2(link->mine,link->remote));
    CHKERRQ(PetscFree(link));
  }
  w->link = NULL;
  /* Free all cached windows; an in-use window means an unmatched Get/Restore pair */
  for (wlink=w->wins; wlink; wlink=wnext) {
    wnext = wlink->next;
    PetscCheck(!wlink->inuse,PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Window still in use with address %p",(void*)wlink->addr);
    CHKERRQ(PetscFree(wlink->dyn_target_addr));
    CHKERRQ(PetscFree(wlink->reqs));
    CHKERRMPI(MPI_Win_free(&wlink->win));
    CHKERRQ(PetscFree(wlink));
  }
  w->wins = NULL;
  CHKERRQ(PetscSFDestroy(&w->dynsf));
  if (w->info != MPI_INFO_NULL) {
    CHKERRMPI(MPI_Info_free(&w->info));
  }
  PetscFunctionReturn(0);
}
747 
/* Destroy the window implementation: reset all state, free the data struct,
   and unregister the composed type-specific methods. */
static PetscErrorCode PetscSFDestroy_Window(PetscSF sf)
{
  PetscFunctionBegin;
  CHKERRQ(PetscSFReset_Window(sf));
  CHKERRQ(PetscFree(sf->data));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowSetSyncType_C",NULL));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowGetSyncType_C",NULL));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowSetFlavorType_C",NULL));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowGetFlavorType_C",NULL));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowSetInfo_C",NULL));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowGetInfo_C",NULL));
  PetscFunctionReturn(0);
}
761 
/* ASCII viewer for the window SF type: prints flavor/sync settings and, in detail
   format, the key-value pairs of the stored MPI_Info handle. */
static PetscErrorCode PetscSFView_Window(PetscSF sf,PetscViewer viewer)
{
  PetscSF_Window    *w = (PetscSF_Window*)sf->data;
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  CHKERRQ(PetscViewerGetFormat(viewer,&format));
  CHKERRQ(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii));
  if (iascii) {
    CHKERRQ(PetscViewerASCIIPrintf(viewer,"  current flavor=%s synchronization=%s MultiSF sort=%s\n",PetscSFWindowFlavorTypes[w->flavor],PetscSFWindowSyncTypes[w->sync],sf->rankorder ? "rank-order" : "unordered"));
    if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
      if (w->info != MPI_INFO_NULL) {
        PetscMPIInt k,nkeys;
        char        key[MPI_MAX_INFO_KEY], value[MPI_MAX_INFO_VAL];

        CHKERRMPI(MPI_Info_get_nkeys(w->info,&nkeys));
        CHKERRQ(PetscViewerASCIIPrintf(viewer,"    current info with %d keys. Ordered key-value pairs follow:\n",nkeys));
        for (k = 0; k < nkeys; k++) {
          PetscMPIInt flag;

          CHKERRMPI(MPI_Info_get_nthkey(w->info,k,key));
          CHKERRMPI(MPI_Info_get(w->info,key,MPI_MAX_INFO_VAL,value,&flag));
          /* flag is false only if the key vanished between the two queries */
          PetscCheck(flag,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing key %s",key);
          CHKERRQ(PetscViewerASCIIPrintf(viewer,"      %s = %s\n",key,value));
        }
      } else {
        CHKERRQ(PetscViewerASCIIPrintf(viewer,"    current info=MPI_INFO_NULL\n"));
      }
    }
  }
  PetscFunctionReturn(0);
}
795 
796 static PetscErrorCode PetscSFDuplicate_Window(PetscSF sf,PetscSFDuplicateOption opt,PetscSF newsf)
797 {
798   PetscSF_Window        *w = (PetscSF_Window*)sf->data;
799   PetscSFWindowSyncType synctype;
800 
801   PetscFunctionBegin;
802   synctype = w->sync;
803   /* HACK: Must use FENCE or LOCK when called from PetscSFGetGroups() because ACTIVE here would cause recursion. */
804   if (!sf->setupcalled) synctype = PETSCSF_WINDOW_SYNC_LOCK;
805   CHKERRQ(PetscSFWindowSetSyncType(newsf,synctype));
806   CHKERRQ(PetscSFWindowSetFlavorType(newsf,w->flavor));
807   CHKERRQ(PetscSFWindowSetInfo(newsf,w->info));
808   PetscFunctionReturn(0);
809 }
810 
/* Start a broadcast (root -> leaves) by opening an RMA epoch on the root window and
   issuing one MPI_Get per root rank.  Only op == MPI_REPLACE is supported.  The matching
   completion (waits, unlocks, epoch close) happens in PetscSFBcastEnd_Window(). */
static PetscErrorCode PetscSFBcastBegin_Window(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,void *leafdata,MPI_Op op)
{
  PetscSF_Window     *w = (PetscSF_Window*)sf->data;
  PetscInt           i,nranks;
  const PetscMPIInt  *ranks;
  const MPI_Aint     *target_disp;  /* per-rank target displacements; non-NULL only for the DYNAMIC flavor */
  const MPI_Datatype *mine,*remote; /* per-rank composite datatypes for the local and remote sides */
  MPI_Request        *reqs;
  MPI_Win            win;

  PetscFunctionBegin;
  PetscCheckFalse(op != MPI_REPLACE,PetscObjectComm((PetscObject)sf), PETSC_ERR_SUP, "PetscSFBcastBegin_Window with op!=MPI_REPLACE has not been implemented");
  CHKERRQ(PetscSFGetRootRanks(sf,&nranks,&ranks,NULL,NULL,NULL));
  CHKERRQ(PetscSFWindowGetDataTypes(sf,unit,&mine,&remote));
  /* Open the exposure epoch: leaves only read roots, so pass NOPUT assertions. */
  CHKERRQ(PetscSFGetWindow(sf,unit,(void*)rootdata,w->sync,PETSC_TRUE,MPI_MODE_NOPUT|MPI_MODE_NOPRECEDE,MPI_MODE_NOPUT,0,&target_disp,&reqs,&win));
  for (i=0; i<nranks; i++) {
    MPI_Aint tdp = target_disp ? target_disp[i] : 0;

    if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) {
      /* Shared lock with NOCHECK: every origin only reads, so no conflicting passive-target access. */
      CHKERRMPI(MPI_Win_lock(MPI_LOCK_SHARED,ranks[i],MPI_MODE_NOCHECK,win));
#if defined(PETSC_HAVE_MPI_RGET)
      /* Request-based get: reqs[i] is completed by MPI_Waitall in PetscSFBcastEnd_Window. */
      CHKERRMPI(MPI_Rget(leafdata,1,mine[i],ranks[i],tdp,1,remote[i],win,&reqs[i]));
#else
      CHKERRMPI(MPI_Get(leafdata,1,mine[i],ranks[i],tdp,1,remote[i],win));
#endif
    } else {
      /* FENCE/ACTIVE sync: the get completes when the epoch is closed in BcastEnd. */
      CHKERRMPI(MPI_Get(leafdata,1,mine[i],ranks[i],tdp,1,remote[i],win));
    }
  }
  PetscFunctionReturn(0);
}
842 
843 PetscErrorCode PetscSFBcastEnd_Window(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
844 {
845   PetscSF_Window *w = (PetscSF_Window*)sf->data;
846   MPI_Win        win;
847   MPI_Request    *reqs = NULL;
848 
849   PetscFunctionBegin;
850   CHKERRQ(PetscSFFindWindow(sf,unit,rootdata,&win,&reqs));
851   if (reqs) CHKERRMPI(MPI_Waitall(sf->nranks,reqs,MPI_STATUSES_IGNORE));
852   if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) {
853     PetscInt           i,nranks;
854     const PetscMPIInt  *ranks;
855 
856     CHKERRQ(PetscSFGetRootRanks(sf,&nranks,&ranks,NULL,NULL,NULL));
857     for (i=0; i<nranks; i++) {
858       CHKERRMPI(MPI_Win_unlock(ranks[i],win));
859     }
860   }
861   CHKERRQ(PetscSFRestoreWindow(sf,unit,(void*)rootdata,w->sync,PETSC_TRUE,MPI_MODE_NOSTORE|MPI_MODE_NOSUCCEED,PETSC_FALSE,&win));
862   PetscFunctionReturn(0);
863 }
864 
865 PetscErrorCode PetscSFReduceBegin_Window(PetscSF sf,MPI_Datatype unit,PetscMemType leafmtype,const void *leafdata,PetscMemType rootmtype,void *rootdata,MPI_Op op)
866 {
867   PetscSF_Window     *w = (PetscSF_Window*)sf->data;
868   PetscInt           i,nranks;
869   const PetscMPIInt  *ranks;
870   const MPI_Aint     *target_disp;
871   const MPI_Datatype *mine,*remote;
872   MPI_Win            win;
873 
874   PetscFunctionBegin;
875   CHKERRQ(PetscSFGetRootRanks(sf,&nranks,&ranks,NULL,NULL,NULL));
876   CHKERRQ(PetscSFWindowGetDataTypes(sf,unit,&mine,&remote));
877   CHKERRQ(PetscSFWindowOpTranslate(&op));
878   CHKERRQ(PetscSFGetWindow(sf,unit,rootdata,w->sync,PETSC_TRUE,MPI_MODE_NOPRECEDE,0,0,&target_disp,NULL,&win));
879   for (i=0; i<nranks; i++) {
880     MPI_Aint tdp = target_disp ? target_disp[i] : 0;
881 
882     if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) CHKERRMPI(MPI_Win_lock(MPI_LOCK_SHARED,ranks[i],MPI_MODE_NOCHECK,win));
883     CHKERRMPI(MPI_Accumulate((void*)leafdata,1,mine[i],ranks[i],tdp,1,remote[i],op,win));
884     if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) CHKERRMPI(MPI_Win_unlock(ranks[i],win));
885   }
886   PetscFunctionReturn(0);
887 }
888 
889 static PetscErrorCode PetscSFReduceEnd_Window(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
890 {
891   PetscSF_Window *w = (PetscSF_Window*)sf->data;
892   MPI_Win        win;
893   MPI_Request    *reqs = NULL;
894 
895   PetscFunctionBegin;
896   CHKERRQ(PetscSFFindWindow(sf,unit,rootdata,&win,&reqs));
897   if (reqs) CHKERRMPI(MPI_Waitall(sf->nranks,reqs,MPI_STATUSES_IGNORE));
898   CHKERRQ(PetscSFRestoreWindow(sf,unit,rootdata,w->sync,PETSC_TRUE,MPI_MODE_NOSUCCEED,PETSC_TRUE,&win));
899   PetscFunctionReturn(0);
900 }
901 
/* Start a fetch-and-op: atomically read the current root values into leafupdate and
   combine leafdata into the roots with op.  With MPI_Get_accumulate this is a single
   atomic call per rank; without it, an exclusive lock brackets a Get+Accumulate pair,
   which requires a freshly created lockable window (see comment below). */
static PetscErrorCode PetscSFFetchAndOpBegin_Window(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,void *rootdata,PetscMemType leafmtype,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscInt           i,nranks;
  const PetscMPIInt  *ranks;
  const MPI_Datatype *mine,*remote;
  const MPI_Aint     *target_disp; /* non-NULL only for the DYNAMIC window flavor */
  MPI_Win            win;
  PetscSF_Window     *w = (PetscSF_Window*)sf->data;
#if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  PetscSFWindowFlavorType oldf; /* saved so the user's flavor can be restored below */
#endif

  PetscFunctionBegin;
  CHKERRQ(PetscSFGetRootRanks(sf,&nranks,&ranks,NULL,NULL,NULL));
  CHKERRQ(PetscSFWindowGetDataTypes(sf,unit,&mine,&remote));
  CHKERRQ(PetscSFWindowOpTranslate(&op));
#if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  /* FetchAndOp without MPI_Get_Accumulate requires locking.
     we create a new window every time to not interfere with user-defined MPI_Info which may have used "no_locks"="true" */
  oldf = w->flavor;
  w->flavor = PETSCSF_WINDOW_FLAVOR_CREATE;
  CHKERRQ(PetscSFGetWindow(sf,unit,rootdata,PETSCSF_WINDOW_SYNC_LOCK,PETSC_FALSE,0,0,0,&target_disp,NULL,&win));
#else
  CHKERRQ(PetscSFGetWindow(sf,unit,rootdata,w->sync,PETSC_TRUE,MPI_MODE_NOPRECEDE,0,0,&target_disp,NULL,&win));
#endif
  for (i=0; i<nranks; i++) {
    MPI_Aint tdp = target_disp ? target_disp[i] : 0;

#if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
    /* Exclusive lock makes the Get+Accumulate pair atomic with respect to other origins. */
    CHKERRMPI(MPI_Win_lock(MPI_LOCK_EXCLUSIVE,ranks[i],0,win));
    CHKERRMPI(MPI_Get(leafupdate,1,mine[i],ranks[i],tdp,1,remote[i],win));
    CHKERRMPI(MPI_Accumulate((void*)leafdata,1,mine[i],ranks[i],tdp,1,remote[i],op,win));
    CHKERRMPI(MPI_Win_unlock(ranks[i],win));
#else
    if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) CHKERRMPI(MPI_Win_lock(MPI_LOCK_SHARED,ranks[i],0,win));
    /* Atomic read-modify-write: old root values land in leafupdate, leafdata is combined with op. */
    CHKERRMPI(MPI_Get_accumulate((void*)leafdata,1,mine[i],leafupdate,1,mine[i],ranks[i],tdp,1,remote[i],op,win));
    if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) CHKERRMPI(MPI_Win_unlock(ranks[i],win));
#endif
  }
#if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  w->flavor = oldf; /* restore the user-selected flavor after the temporary CREATE window */
#endif
  PetscFunctionReturn(0);
}
946 
/* Complete a fetch-and-op started by PetscSFFetchAndOpBegin_Window(): wait on any
   outstanding requests and close the epoch.  The sync/epoch arguments must mirror
   the Begin side, which differs depending on MPI_Get_accumulate availability. */
static PetscErrorCode PetscSFFetchAndOpEnd_Window(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  MPI_Win        win;
#if defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  PetscSF_Window *w = (PetscSF_Window*)sf->data; /* only needed for w->sync on this path */
#endif
  MPI_Request    *reqs = NULL;

  PetscFunctionBegin;
  CHKERRQ(PetscSFFindWindow(sf,unit,rootdata,&win,&reqs));
  if (reqs) CHKERRMPI(MPI_Waitall(sf->nranks,reqs,MPI_STATUSES_IGNORE));
#if defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  CHKERRQ(PetscSFRestoreWindow(sf,unit,rootdata,w->sync,PETSC_TRUE,MPI_MODE_NOSUCCEED,PETSC_TRUE,&win));
#else
  /* Begin opened the window with SYNC_LOCK and epoch=PETSC_FALSE; close it the same way. */
  CHKERRQ(PetscSFRestoreWindow(sf,unit,rootdata,PETSCSF_WINDOW_SYNC_LOCK,PETSC_FALSE,0,PETSC_TRUE,&win));
#endif
  PetscFunctionReturn(0);
}
965 
966 PETSC_INTERN PetscErrorCode PetscSFCreate_Window(PetscSF sf)
967 {
968   PetscSF_Window *w = (PetscSF_Window*)sf->data;
969 
970   PetscFunctionBegin;
971   sf->ops->SetUp           = PetscSFSetUp_Window;
972   sf->ops->SetFromOptions  = PetscSFSetFromOptions_Window;
973   sf->ops->Reset           = PetscSFReset_Window;
974   sf->ops->Destroy         = PetscSFDestroy_Window;
975   sf->ops->View            = PetscSFView_Window;
976   sf->ops->Duplicate       = PetscSFDuplicate_Window;
977   sf->ops->BcastBegin      = PetscSFBcastBegin_Window;
978   sf->ops->BcastEnd        = PetscSFBcastEnd_Window;
979   sf->ops->ReduceBegin     = PetscSFReduceBegin_Window;
980   sf->ops->ReduceEnd       = PetscSFReduceEnd_Window;
981   sf->ops->FetchAndOpBegin = PetscSFFetchAndOpBegin_Window;
982   sf->ops->FetchAndOpEnd   = PetscSFFetchAndOpEnd_Window;
983 
984   CHKERRQ(PetscNewLog(sf,&w));
985   sf->data  = (void*)w;
986   w->sync   = PETSCSF_WINDOW_SYNC_FENCE;
987   w->flavor = PETSCSF_WINDOW_FLAVOR_CREATE;
988   w->info   = MPI_INFO_NULL;
989 
990   CHKERRQ(PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowSetSyncType_C",PetscSFWindowSetSyncType_Window));
991   CHKERRQ(PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowGetSyncType_C",PetscSFWindowGetSyncType_Window));
992   CHKERRQ(PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowSetFlavorType_C",PetscSFWindowSetFlavorType_Window));
993   CHKERRQ(PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowGetFlavorType_C",PetscSFWindowGetFlavorType_Window));
994   CHKERRQ(PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowSetInfo_C",PetscSFWindowSetInfo_Window));
995   CHKERRQ(PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowGetInfo_C",PetscSFWindowGetInfo_Window));
996 
997 #if defined(OMPI_MAJOR_VERSION) && (OMPI_MAJOR_VERSION < 1 || (OMPI_MAJOR_VERSION == 1 && OMPI_MINOR_VERSION <= 6))
998   {
999     PetscBool ackbug = PETSC_FALSE;
1000     CHKERRQ(PetscOptionsGetBool(NULL,NULL,"-acknowledge_ompi_onesided_bug",&ackbug,NULL));
1001     if (ackbug) {
1002       CHKERRQ(PetscInfo(sf,"Acknowledged Open MPI bug, proceeding anyway. Expect memory corruption.\n"));
1003     } else SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_LIB,"Open MPI is known to be buggy (https://svn.open-mpi.org/trac/ompi/ticket/1905 and 2656), use -acknowledge_ompi_onesided_bug to proceed");
1004   }
1005 #endif
1006   PetscFunctionReturn(0);
1007 }
1008