xref: /petsc/src/vec/is/sf/impls/window/sfwindow.c (revision 5b0d146a4c27a34a26fabb00e248697b7b651ba6)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 
typedef struct _n_PetscSFDataLink *PetscSFDataLink;
typedef struct _n_PetscSFWinLink  *PetscSFWinLink;

/* Private implementation data for a PetscSF of the window (MPI one-sided) type */
typedef struct {
  PetscSFWindowSyncType   sync;   /* FENCE, LOCK, or ACTIVE synchronization */
  PetscSFDataLink         link;   /* List of MPI data types, lazily constructed for each data type */
  PetscSFWinLink          wins;   /* List of active windows */
  PetscSFWindowFlavorType flavor; /* Current PETSCSF_WINDOW_FLAVOR_ */
  PetscSF                 dynsf;  /* Auxiliary SF used to broadcast per-rank window addresses for the DYNAMIC flavor */
  MPI_Info                info;   /* Info handle passed to MPI window creation; MPI_INFO_NULL when unset */
} PetscSF_Window;
14 
/* Cache entry: composite MPI datatypes derived from one unit type (one mine/remote pair per root rank) */
struct _n_PetscSFDataLink {
  MPI_Datatype    unit;    /* Unit datatype the entry was built for (duplicated handle, freed on reset) */
  MPI_Datatype    *mine;   /* Indexed types addressing the local leaf buffer, one per root rank */
  MPI_Datatype    *remote; /* Indexed types addressing the remote root buffer, one per root rank */
  PetscSFDataLink next;    /* Next entry in the singly linked cache */
};
21 
/* One MPI window tracked by the SF, kept in a singly linked list for possible reuse */
struct _n_PetscSFWinLink {
  PetscBool               inuse;           /* Window is currently checked out via PetscSFGetWindow() */
  size_t                  bytes;           /* Extent in bytes of the unit datatype the window was created for */
  void                    *addr;           /* Base of exposed memory: the user array, or the MPI-allocated buffer for ALLOCATE/SHARED */
  void                    *paddr;          /* User array currently associated with the window (NULL when restored) */
  MPI_Win                 win;             /* The MPI window handle */
  PetscSFWindowFlavorType flavor;          /* Flavor the window was created with */
  MPI_Aint                *dyn_target_addr;/* Per-root-rank remote base addresses (DYNAMIC flavor only, else NULL) */
  PetscBool               epoch;           /* An access/exposure epoch is currently open on this window */
  PetscSFWinLink          next;            /* Next window in the list */
};
33 
/* Human-readable names for the sync/flavor enums, in the list format expected by PetscOptionsEnum():
   value names, enum type name, value-name prefix, NULL terminator */
const char *const PetscSFWindowSyncTypes[] = {"FENCE","LOCK","ACTIVE","PetscSFWindowSyncType","PETSCSF_WINDOW_SYNC_",0};
const char *const PetscSFWindowFlavorTypes[] = {"CREATE","DYNAMIC","ALLOCATE","SHARED","PetscSFWindowFlavorType","PETSCSF_WINDOW_FLAVOR_",0};
36 
37 /* Built-in MPI_Ops act elementwise inside MPI_Accumulate, but cannot be used with composite types inside collectives (MPIU_Allreduce) */
38 static PetscErrorCode PetscSFWindowOpTranslate(MPI_Op *op)
39 {
40   PetscFunctionBegin;
41   if (*op == MPIU_SUM) *op = MPI_SUM;
42   else if (*op == MPIU_MAX) *op = MPI_MAX;
43   else if (*op == MPIU_MIN) *op = MPI_MIN;
44   PetscFunctionReturn(0);
45 }
46 
47 /*@C
48    PetscSFWindowGetDataTypes - gets composite local and remote data types for each rank
49 
50    Not Collective
51 
52    Input Arguments:
53 +  sf - star forest
54 -  unit - data type for each node
55 
56    Output Arguments:
57 +  localtypes - types describing part of local leaf buffer referencing each remote rank
58 -  remotetypes - types describing part of remote root buffer referenced for each remote rank
59 
60    Level: developer
61 
62 .seealso: PetscSFSetGraph(), PetscSFView()
63 @*/
64 static PetscErrorCode PetscSFWindowGetDataTypes(PetscSF sf,MPI_Datatype unit,const MPI_Datatype **localtypes,const MPI_Datatype **remotetypes)
65 {
66   PetscSF_Window    *w = (PetscSF_Window*)sf->data;
67   PetscErrorCode    ierr;
68   PetscSFDataLink   link;
69   PetscInt          i,nranks;
70   const PetscInt    *roffset,*rmine,*rremote;
71   const PetscMPIInt *ranks;
72 
73   PetscFunctionBegin;
74   /* Look for types in cache */
75   for (link=w->link; link; link=link->next) {
76     PetscBool match;
77     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
78     if (match) {
79       *localtypes  = link->mine;
80       *remotetypes = link->remote;
81       PetscFunctionReturn(0);
82     }
83   }
84 
85   /* Create new composite types for each send rank */
86   ierr = PetscSFGetRootRanks(sf,&nranks,&ranks,&roffset,&rmine,&rremote);CHKERRQ(ierr);
87   ierr = PetscNew(&link);CHKERRQ(ierr);
88   ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);
89   ierr = PetscMalloc2(nranks,&link->mine,nranks,&link->remote);CHKERRQ(ierr);
90   for (i=0; i<nranks; i++) {
91     PetscInt    rcount = roffset[i+1] - roffset[i];
92     PetscMPIInt *rmine,*rremote;
93 #if !defined(PETSC_USE_64BIT_INDICES)
94     rmine   = sf->rmine + sf->roffset[i];
95     rremote = sf->rremote + sf->roffset[i];
96 #else
97     PetscInt j;
98     ierr = PetscMalloc2(rcount,&rmine,rcount,&rremote);CHKERRQ(ierr);
99     for (j=0; j<rcount; j++) {
100       ierr = PetscMPIIntCast(sf->rmine[sf->roffset[i]+j],rmine+j);CHKERRQ(ierr);
101       ierr = PetscMPIIntCast(sf->rremote[sf->roffset[i]+j],rremote+j);CHKERRQ(ierr);
102     }
103 #endif
104 
105     ierr = MPI_Type_create_indexed_block(rcount,1,rmine,link->unit,&link->mine[i]);CHKERRQ(ierr);
106     ierr = MPI_Type_create_indexed_block(rcount,1,rremote,link->unit,&link->remote[i]);CHKERRQ(ierr);
107 #if defined(PETSC_USE_64BIT_INDICES)
108     ierr = PetscFree2(rmine,rremote);CHKERRQ(ierr);
109 #endif
110     ierr = MPI_Type_commit(&link->mine[i]);CHKERRQ(ierr);
111     ierr = MPI_Type_commit(&link->remote[i]);CHKERRQ(ierr);
112   }
113   link->next = w->link;
114   w->link    = link;
115 
116   *localtypes  = link->mine;
117   *remotetypes = link->remote;
118   PetscFunctionReturn(0);
119 }
120 
/*@C
   PetscSFWindowSetFlavorType - Set flavor type for MPI_Win creation

   Logically Collective

   Input Arguments:
+  sf - star forest for communication
-  flavor - flavor type

   Options Database Key:
.  -sf_window_flavor <flavor> - sets the flavor type CREATE, DYNAMIC, ALLOCATE or SHARED (see PetscSFWindowFlavorType)

   Level: advanced

   Notes: Window reuse follows these rules:

     PETSCSF_WINDOW_FLAVOR_CREATE: creates a new window every time, uses MPI_Win_create

     PETSCSF_WINDOW_FLAVOR_DYNAMIC: uses MPI_Win_create_dynamic/MPI_Win_attach and tries to reuse windows by comparing the root array. Intended to be used on repeated applications of the same SF, e.g.
       for i=1 to K
         PetscSFOperationBegin(rootdata1,leafdata_whatever);
         PetscSFOperationEnd(rootdata1,leafdata_whatever);
         ...
         PetscSFOperationBegin(rootdataN,leafdata_whatever);
         PetscSFOperationEnd(rootdataN,leafdata_whatever);
       endfor
       The following pattern will instead raise an error
         PetscSFOperationBegin(rootdata1,leafdata_whatever);
         PetscSFOperationEnd(rootdata1,leafdata_whatever);
         PetscSFOperationBegin(rank ? rootdata1 : rootdata2,leafdata_whatever);
         PetscSFOperationEnd(rank ? rootdata1 : rootdata2,leafdata_whatever);

     PETSCSF_WINDOW_FLAVOR_ALLOCATE: uses MPI_Win_allocate, reuses any pre-existing window which fits the data and is not in use

     PETSCSF_WINDOW_FLAVOR_SHARED: uses MPI_Win_allocate_shared, reuse policy as for PETSCSF_WINDOW_FLAVOR_ALLOCATE

.seealso: PetscSFSetFromOptions(), PetscSFWindowGetFlavorType()
@*/
PetscErrorCode PetscSFWindowSetFlavorType(PetscSF sf,PetscSFWindowFlavorType flavor)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveEnum(sf,flavor,2);
  /* Dispatch to the type-specific implementation; silently a no-op if the SF type does not provide one */
  ierr = PetscTryMethod(sf,"PetscSFWindowSetFlavorType_C",(PetscSF,PetscSFWindowFlavorType),(sf,flavor));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
169 
/* Window-type implementation of PetscSFWindowSetFlavorType(): record the requested flavor */
static PetscErrorCode PetscSFWindowSetFlavorType_Window(PetscSF sf,PetscSFWindowFlavorType flavor)
{
  PetscSF_Window *w = (PetscSF_Window*)sf->data;

  PetscFunctionBegin;
  w->flavor = flavor;
  PetscFunctionReturn(0);
}
178 
/*@C
   PetscSFWindowGetFlavorType - Get flavor type for PetscSF communication

   Logically Collective

   Input Argument:
.  sf - star forest for communication

   Output Argument:
.  flavor - flavor type

   Level: advanced

.seealso: PetscSFSetFromOptions(), PetscSFWindowSetFlavorType()
@*/
PetscErrorCode PetscSFWindowGetFlavorType(PetscSF sf,PetscSFWindowFlavorType *flavor)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidPointer(flavor,2);
  /* Dispatch to the type-specific implementation; errors if the SF type does not provide one */
  ierr = PetscUseMethod(sf,"PetscSFWindowGetFlavorType_C",(PetscSF,PetscSFWindowFlavorType*),(sf,flavor));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
204 
/* Window-type implementation of PetscSFWindowGetFlavorType(): return the stored flavor */
static PetscErrorCode PetscSFWindowGetFlavorType_Window(PetscSF sf,PetscSFWindowFlavorType *flavor)
{
  PetscSF_Window *w = (PetscSF_Window*)sf->data;

  PetscFunctionBegin;
  *flavor = w->flavor;
  PetscFunctionReturn(0);
}
213 
/*@C
   PetscSFWindowSetSyncType - Set synchronization type for PetscSF communication

   Logically Collective

   Input Arguments:
+  sf - star forest for communication
-  sync - synchronization type

   Options Database Key:
.  -sf_window_sync <sync> - sets the synchronization type FENCE, LOCK, or ACTIVE (see PetscSFWindowSyncType)

   Level: advanced

.seealso: PetscSFSetFromOptions(), PetscSFWindowGetSyncType()
@*/
PetscErrorCode PetscSFWindowSetSyncType(PetscSF sf,PetscSFWindowSyncType sync)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveEnum(sf,sync,2);
  /* Dispatch to the type-specific implementation; silently a no-op if the SF type does not provide one */
  ierr = PetscTryMethod(sf,"PetscSFWindowSetSyncType_C",(PetscSF,PetscSFWindowSyncType),(sf,sync));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
240 
/* Window-type implementation of PetscSFWindowSetSyncType(): record the requested synchronization */
static PetscErrorCode PetscSFWindowSetSyncType_Window(PetscSF sf,PetscSFWindowSyncType sync)
{
  PetscSF_Window *w = (PetscSF_Window*)sf->data;

  PetscFunctionBegin;
  w->sync = sync;
  PetscFunctionReturn(0);
}
249 
/*@C
   PetscSFWindowGetSyncType - Get synchronization type for PetscSF communication

   Logically Collective

   Input Argument:
.  sf - star forest for communication

   Output Argument:
.  sync - synchronization type

   Level: advanced

.seealso: PetscSFSetFromOptions(), PetscSFWindowSetSyncType()
@*/
PetscErrorCode PetscSFWindowGetSyncType(PetscSF sf,PetscSFWindowSyncType *sync)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidPointer(sync,2);
  /* Dispatch to the type-specific implementation; errors if the SF type does not provide one */
  ierr = PetscUseMethod(sf,"PetscSFWindowGetSyncType_C",(PetscSF,PetscSFWindowSyncType*),(sf,sync));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
275 
/* Window-type implementation of PetscSFWindowGetSyncType(): return the stored synchronization */
static PetscErrorCode PetscSFWindowGetSyncType_Window(PetscSF sf,PetscSFWindowSyncType *sync)
{
  PetscSF_Window *w = (PetscSF_Window*)sf->data;

  PetscFunctionBegin;
  *sync = w->sync;
  PetscFunctionReturn(0);
}
284 
/*@C
   PetscSFWindowSetInfo - Set the MPI_Info handle that will be used for subsequent windows allocation

   Logically Collective

   Input Arguments:
+  sf - star forest for communication
-  info - MPI_Info handle

   Level: advanced

   Notes: the info handle is duplicated with a call to MPI_Info_dup unless info = MPI_INFO_NULL.

.seealso: PetscSFSetFromOptions(), PetscSFWindowGetInfo()
@*/
PetscErrorCode PetscSFWindowSetInfo(PetscSF sf,MPI_Info info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Dispatch to the type-specific implementation; silently a no-op if the SF type does not provide one */
  ierr = PetscTryMethod(sf,"PetscSFWindowSetInfo_C",(PetscSF,MPI_Info),(sf,info));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
309 
/* Window-type implementation of PetscSFWindowSetInfo(): replace the stored info handle */
static PetscErrorCode PetscSFWindowSetInfo_Window(PetscSF sf,MPI_Info info)
{
  PetscSF_Window *w = (PetscSF_Window*)sf->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Release any previously set handle (MPI_Info_free resets it to MPI_INFO_NULL) */
  if (w->info != MPI_INFO_NULL) {
    ierr = MPI_Info_free(&w->info);CHKERRQ(ierr);
  }
  /* Store a private duplicate so the caller's handle remains owned by the caller */
  if (info != MPI_INFO_NULL) {
    ierr = MPI_Info_dup(info,&w->info);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
324 
/*@C
   PetscSFWindowGetInfo - Get the MPI_Info handle used for windows allocation

   Logically Collective

   Input Argument:
.  sf - star forest for communication

   Output Argument:
.  info - MPI_Info handle

   Level: advanced

   Notes: if PetscSFWindowSetInfo() has not been called, this returns MPI_INFO_NULL

.seealso: PetscSFSetFromOptions(), PetscSFWindowSetInfo()
@*/
PetscErrorCode PetscSFWindowGetInfo(PetscSF sf,MPI_Info *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidPointer(info,2);
  /* Dispatch to the type-specific implementation; errors if the SF type does not provide one */
  ierr = PetscUseMethod(sf,"PetscSFWindowGetInfo_C",(PetscSF,MPI_Info*),(sf,info));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
352 
/* Window-type implementation of PetscSFWindowGetInfo(): return the stored info handle (not a copy) */
static PetscErrorCode PetscSFWindowGetInfo_Window(PetscSF sf,MPI_Info *info)
{
  PetscSF_Window *w = (PetscSF_Window*)sf->data;

  PetscFunctionBegin;
  *info = w->info;
  PetscFunctionReturn(0);
}
361 
/*
   PetscSFGetWindow - Get a window for use with a given data type

   Collective on PetscSF

   Input Arguments:
+  sf - star forest
.  unit - data type
.  array - array to be sent
.  sync - type of synchronization PetscSFWindowSyncType
.  epoch - PETSC_TRUE to acquire the window and start an epoch, PETSC_FALSE to just acquire the window
.  fenceassert - assert parameter for call to MPI_Win_fence(), if sync == PETSCSF_WINDOW_SYNC_FENCE
.  postassert - assert parameter for call to MPI_Win_post(), if sync == PETSCSF_WINDOW_SYNC_ACTIVE
.  startassert - assert parameter for call to MPI_Win_start(), if sync == PETSCSF_WINDOW_SYNC_ACTIVE
-  target_disp - target_disp argument to RMA calls (significative for PETSCSF_WINDOW_FLAVOR_DYNAMIC flavor only)

   Output Arguments:
.  win - window

   Level: developer
.seealso: PetscSFGetRootRanks(), PetscSFWindowGetDataTypes()
*/
static PetscErrorCode PetscSFGetWindow(PetscSF sf,MPI_Datatype unit,void *array,PetscSFWindowSyncType sync,PetscBool epoch,PetscMPIInt fenceassert,PetscMPIInt postassert,PetscMPIInt startassert,const MPI_Aint **target_disp, MPI_Win *win)
{
  PetscSF_Window *w = (PetscSF_Window*)sf->data;
  PetscErrorCode ierr;
  MPI_Aint       lb,lb_true,bytes,bytes_true;
  PetscSFWinLink link;
  MPI_Aint       winaddr;
  PetscInt       nranks;
  PetscBool      reuse = PETSC_FALSE, update = PETSC_FALSE;
#if defined(PETSC_USE_DEBUG)
  PetscBool      dummy[2];
#endif
  MPI_Aint       wsize;

  PetscFunctionBegin;
  /* Only units with zero lower bound and no padding (extent == true extent) are supported */
  ierr = MPI_Type_get_extent(unit,&lb,&bytes);CHKERRQ(ierr);
  ierr = MPI_Type_get_true_extent(unit,&lb_true,&bytes_true);CHKERRQ(ierr);
  if (lb != 0 || lb_true != 0) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"No support for unit type with nonzero lower bound, write petsc-maint@mcs.anl.gov if you want this feature");
  if (bytes != bytes_true) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"No support for unit type with modified extent, write petsc-maint@mcs.anl.gov if you want this feature");
  /* All flavors except CREATE first try to reuse a window from the list */
  if (w->flavor != PETSCSF_WINDOW_FLAVOR_CREATE) reuse = PETSC_TRUE;
  for (link=w->wins; reuse && link; link=link->next) {
    PetscBool winok = PETSC_FALSE;
    if (w->flavor != link->flavor) continue;
    switch (w->flavor) {
    case PETSCSF_WINDOW_FLAVOR_DYNAMIC: /* check available matching array, error if in use (we additionally check that the matching condition is the same across processes) */
      if (array == link->addr) {
#if defined(PETSC_USE_DEBUG)
        /* Collective sanity check: all processes matched (LAND) iff any process matched (LOR) */
        dummy[0] = PETSC_TRUE;
        dummy[1] = PETSC_TRUE;
        ierr = MPI_Allreduce(MPI_IN_PLACE,dummy  ,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)sf));CHKERRQ(ierr);
        ierr = MPI_Allreduce(MPI_IN_PLACE,dummy+1,1,MPIU_BOOL,MPI_LOR ,PetscObjectComm((PetscObject)sf));CHKERRQ(ierr);
        if (dummy[0] != dummy[1]) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"PETSCSF_WINDOW_FLAVOR_DYNAMIC requires root pointers to be consistently used across the comm. Use PETSCSF_WINDOW_FLAVOR_CREATE or PETSCSF_WINDOW_FLAVOR_ALLOCATE instead");
#endif
        if (link->inuse) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_PLIB,"Window in use");
        if (epoch && link->epoch) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_PLIB,"Window epoch not finished");
        winok = PETSC_TRUE;
        link->paddr = array;
#if defined(PETSC_USE_DEBUG)
      } else {
        /* The non-matching side must participate in the same collectives for the check above */
        dummy[0] = PETSC_FALSE;
        dummy[1] = PETSC_FALSE;
        ierr = MPI_Allreduce(MPI_IN_PLACE,dummy  ,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)sf));CHKERRQ(ierr);
        ierr = MPI_Allreduce(MPI_IN_PLACE,dummy+1,1,MPIU_BOOL,MPI_LOR ,PetscObjectComm((PetscObject)sf));CHKERRQ(ierr);
        if (dummy[0] != dummy[1]) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"PETSCSF_WINDOW_FLAVOR_DYNAMIC requires root pointers to be consistently used across the comm. Use PETSCSF_WINDOW_FLAVOR_CREATE or PETSCSF_WINDOW_FLAVOR_ALLOCATE instead");
#endif
      }
      break;
    case PETSCSF_WINDOW_FLAVOR_ALLOCATE: /* check available by matching size, allocate if in use */
    case PETSCSF_WINDOW_FLAVOR_SHARED:
      if (!link->inuse && bytes == (MPI_Aint)link->bytes) {
        update = PETSC_TRUE; /* MPI-allocated buffer must be refreshed from the user array below */
        link->paddr = array;
        winok = PETSC_TRUE;
      }
      break;
    default: SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"No support for flavor %s",PetscSFWindowFlavorTypes[w->flavor]);
    }
    if (winok) {
      *win = link->win;
      /* NOTE(review): %d for MPI_Win/MPI_Comm assumes int-sized handles (true for MPICH, not Open MPI) -- confirm format safety */
      ierr = PetscInfo3(sf,"Reusing window %d of flavor %d for comm %d\n",link->win,link->flavor,PetscObjectComm((PetscObject)sf));CHKERRQ(ierr);
      goto found;
    }
  }

  /* No reusable window: create a new one and prepend it to the list */
  wsize = (MPI_Aint)bytes*sf->nroots;
  ierr = PetscNew(&link);CHKERRQ(ierr);
  link->bytes           = bytes;
  link->next            = w->wins;
  link->flavor          = w->flavor;
  link->dyn_target_addr = NULL;
  w->wins               = link;
  switch (w->flavor) {
  case PETSCSF_WINDOW_FLAVOR_CREATE:
    ierr = MPI_Win_create(array,wsize,(PetscMPIInt)bytes,w->info,PetscObjectComm((PetscObject)sf),&link->win);CHKERRQ(ierr);
    link->addr  = array;
    link->paddr = array;
    break;
  case PETSCSF_WINDOW_FLAVOR_DYNAMIC:
    ierr = MPI_Win_create_dynamic(w->info,PetscObjectComm((PetscObject)sf),&link->win);CHKERRQ(ierr);
#if defined(PETSC_HAVE_OMPI_MAJOR_VERSION) /* some OpenMPI versions do not support MPI_Win_attach(win,NULL,0); */
    ierr = MPI_Win_attach(link->win,wsize ? array : &ierr,wsize);CHKERRQ(ierr);
#else
    ierr = MPI_Win_attach(link->win,array,wsize);CHKERRQ(ierr);
#endif
    link->addr  = array;
    link->paddr = array;
    if (!w->dynsf) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ORDER,"Must call PetscSFSetUp()");
    /* With dynamic windows each process's base address differs; broadcast every
       root's attached address to the ranks that will target it */
    ierr = PetscSFSetUp(w->dynsf);CHKERRQ(ierr);
    ierr = PetscSFGetRootRanks(w->dynsf,&nranks,NULL,NULL,NULL,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(nranks,&link->dyn_target_addr);CHKERRQ(ierr);
    ierr = MPI_Get_address(array,&winaddr);CHKERRQ(ierr);
    ierr = PetscSFBcastBegin(w->dynsf,MPI_AINT,&winaddr,link->dyn_target_addr);CHKERRQ(ierr);
    ierr = PetscSFBcastEnd(w->dynsf,MPI_AINT,&winaddr,link->dyn_target_addr);CHKERRQ(ierr);
    break;
  case PETSCSF_WINDOW_FLAVOR_ALLOCATE:
    ierr = MPI_Win_allocate(wsize,(PetscMPIInt)bytes,w->info,PetscObjectComm((PetscObject)sf),&link->addr,&link->win);CHKERRQ(ierr);
    update = PETSC_TRUE; /* MPI owns the buffer; fill it from the user array below */
    link->paddr = array;
    break;
#if defined(PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY)
  case PETSCSF_WINDOW_FLAVOR_SHARED:
    ierr = MPI_Win_allocate_shared(wsize,(PetscMPIInt)bytes,w->info,PetscObjectComm((PetscObject)sf),&link->addr,&link->win);CHKERRQ(ierr);
    update = PETSC_TRUE;
    link->paddr = array;
    break;
#endif
  default: SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"No support for flavor %s",PetscSFWindowFlavorTypes[w->flavor]);
  }
  ierr = PetscInfo3(sf,"New window %d of flavor %d for comm %d\n",link->win,link->flavor,PetscObjectComm((PetscObject)sf));CHKERRQ(ierr);
  *win = link->win;

found:

  if (update) {
    /* Copy user data into the MPI-allocated window buffer before exposing it */
    ierr = PetscMemcpy(link->addr,array,sf->nroots*bytes);CHKERRQ(ierr);
    if (sync == PETSCSF_WINDOW_SYNC_LOCK) {
      ierr = MPI_Win_fence(0,*win);CHKERRQ(ierr);
    }
  }
  link->inuse = PETSC_TRUE;
  link->epoch = epoch;
  *target_disp = link->dyn_target_addr;
  if (epoch) {
    /* Open the access/exposure epoch using the requested synchronization model */
    switch (sync) {
    case PETSCSF_WINDOW_SYNC_FENCE:
      ierr = MPI_Win_fence(fenceassert,*win);CHKERRQ(ierr);
      break;
    case PETSCSF_WINDOW_SYNC_LOCK: /* Handled outside */
      break;
    case PETSCSF_WINDOW_SYNC_ACTIVE: {
      MPI_Group   ingroup,outgroup;
      PetscMPIInt isize,osize;

      /* OpenMPI 4.0.2 with btl=vader does not like calling
         - MPI_Win_complete when ogroup is empty
         - MPI_Win_wait when igroup is empty
         So, we do not even issue the corresponding start and post calls
         The MPI standard (Sec. 11.5.2 of MPI 3.1) only requires that
         start(outgroup) has a matching post(ingroup)
         and this is guaranteed by PetscSF
      */
      ierr = PetscSFGetGroups(sf,&ingroup,&outgroup);CHKERRQ(ierr);
      ierr = MPI_Group_size(ingroup,&isize);CHKERRQ(ierr);
      ierr = MPI_Group_size(outgroup,&osize);CHKERRQ(ierr);
      if (isize) { ierr = MPI_Win_post(ingroup,postassert,*win);CHKERRQ(ierr); }
      if (osize) { ierr = MPI_Win_start(outgroup,startassert,*win);CHKERRQ(ierr); }
    } break;
    default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_PLIB,"Unknown synchronization type");
    }
  }
  PetscFunctionReturn(0);
}
536 
537 /*
538    PetscSFFindWindow - Finds a window that is already in use
539 
540    Not Collective
541 
542    Input Arguments:
543 +  sf - star forest
544 .  unit - data type
545 -  array - array with which the window is associated
546 
547    Output Arguments:
548 .  win - window
549 
550    Level: developer
551 
552 .seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
553 */
554 static PetscErrorCode PetscSFFindWindow(PetscSF sf,MPI_Datatype unit,const void *array,MPI_Win *win)
555 {
556   PetscSF_Window *w = (PetscSF_Window*)sf->data;
557   PetscSFWinLink link;
558   PetscErrorCode ierr;
559 
560   PetscFunctionBegin;
561   *win = MPI_WIN_NULL;
562   for (link=w->wins; link; link=link->next) {
563     if (array == link->paddr) {
564       ierr = PetscInfo3(sf,"Window %d of flavor %d for comm %d\n",link->win,link->flavor,PetscObjectComm((PetscObject)sf));CHKERRQ(ierr);
565       *win = link->win;
566       PetscFunctionReturn(0);
567     }
568   }
569   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Requested window not in use");
570   PetscFunctionReturn(0);
571 }
572 
/*
   PetscSFRestoreWindow - Restores a window obtained with PetscSFGetWindow()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
.  array - array associated with window
.  sync - type of synchronization PetscSFWindowSyncType
.  epoch - close an epoch, must match argument to PetscSFGetWindow()
.  update - if we have to update the local window array
-  win - window

   Level: developer

.seealso: PetscSFFindWindow()
*/
static PetscErrorCode PetscSFRestoreWindow(PetscSF sf,MPI_Datatype unit,void *array,PetscSFWindowSyncType sync,PetscBool epoch,PetscMPIInt fenceassert,PetscBool update,MPI_Win *win)
{
  PetscSF_Window          *w = (PetscSF_Window*)sf->data;
  PetscErrorCode          ierr;
  PetscSFWinLink          *p,link;
  PetscBool               reuse = PETSC_FALSE;
  PetscSFWindowFlavorType flavor;
  void*                   laddr;
  size_t                  bytes;

  PetscFunctionBegin;
  /* Walk via a pointer-to-link so a CREATE-flavor entry can be unlinked in place */
  for (p=&w->wins; *p; p=&(*p)->next) {
    link = *p;
    if (*win == link->win) {
      if (array != link->paddr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Matched window, but not array");
      /* The epoch flag must agree with the state recorded at PetscSFGetWindow() time */
      if (epoch != link->epoch) {
        if (epoch) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"No epoch to end");
        else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Restoring window without ending epoch");
      }
      laddr = link->addr;
      flavor = link->flavor;
      bytes = link->bytes;
      /* CREATE windows are never reused: unlink and (since the window exposes the
         user array directly) no copy-back is needed */
      if (flavor != PETSCSF_WINDOW_FLAVOR_CREATE) reuse = PETSC_TRUE;
      else { *p = link->next; update = PETSC_FALSE; } /* remove from list */
      goto found;
    }
  }
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Requested window not in use");

found:
  ierr = PetscInfo3(sf,"Window %d of flavor %d for comm %d\n",link->win,link->flavor,PetscObjectComm((PetscObject)sf));CHKERRQ(ierr);
  if (epoch) {
    /* Close the epoch with the synchronization model it was opened with */
    switch (sync) {
    case PETSCSF_WINDOW_SYNC_FENCE:
      ierr = MPI_Win_fence(fenceassert,*win);CHKERRQ(ierr);
      break;
    case PETSCSF_WINDOW_SYNC_LOCK: /* Handled outside */
      break;
    case PETSCSF_WINDOW_SYNC_ACTIVE: {
      MPI_Group   ingroup,outgroup;
      PetscMPIInt isize,osize;

      /* OpenMPI 4.0.2 with btl=wader does not like calling
         - MPI_Win_complete when ogroup is empty
         - MPI_Win_wait when igroup is empty
         The MPI standard (Sec. 11.5.2 of MPI 3.1) only requires that
         - each process who issues a call to MPI_Win_start issues a call to MPI_Win_Complete
         - each process who issues a call to MPI_Win_post issues a call to MPI_Win_Wait
      */
      ierr = PetscSFGetGroups(sf,&ingroup,&outgroup);CHKERRQ(ierr);
      ierr = MPI_Group_size(ingroup,&isize);CHKERRQ(ierr);
      ierr = MPI_Group_size(outgroup,&osize);CHKERRQ(ierr);
      if (osize) { ierr = MPI_Win_complete(*win);CHKERRQ(ierr); }
      if (isize) { ierr = MPI_Win_wait(*win);CHKERRQ(ierr); }
    } break;
    default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_PLIB,"Unknown synchronization type");
    }
  }
  if (update) {
    if (sync == PETSCSF_WINDOW_SYNC_LOCK) {
      ierr = MPI_Win_fence(MPI_MODE_NOPUT|MPI_MODE_NOSUCCEED,*win);CHKERRQ(ierr);
    }
    /* Copy the (possibly remotely modified) window buffer back into the user array */
    ierr = PetscMemcpy(array,laddr,sf->nroots*bytes);CHKERRQ(ierr);
  }
  link->epoch = PETSC_FALSE;
  link->inuse = PETSC_FALSE;
  link->paddr = NULL;
  if (!reuse) {
    ierr = MPI_Win_free(&link->win);CHKERRQ(ierr);
    ierr = PetscFree(link);CHKERRQ(ierr);
    *win = MPI_WIN_NULL;
  }
  PetscFunctionReturn(0);
}
665 
/* Set up the window SF: establish root ranks and build the auxiliary address-exchange SF */
static PetscErrorCode PetscSFSetUp_Window(PetscSF sf)
{
  PetscSF_Window *w = (PetscSF_Window*)sf->data;
  PetscErrorCode ierr;
  MPI_Group      ingroup,outgroup;

  PetscFunctionBegin;
  ierr = PetscSFSetUpRanks(sf,MPI_GROUP_EMPTY);CHKERRQ(ierr);
  if (!w->dynsf) {
    PetscInt    i;
    PetscSFNode *remotes;

    /* Build a one-root-per-rank SF used by the DYNAMIC flavor to broadcast the
       per-process window base addresses (see PetscSFGetWindow()) */
    ierr = PetscMalloc1(sf->nranks,&remotes);CHKERRQ(ierr);
    for (i=0;i<sf->nranks;i++) {
      remotes[i].rank  = sf->ranks[i];
      remotes[i].index = 0;
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&w->dynsf);CHKERRQ(ierr);
    ierr = PetscSFWindowSetFlavorType(w->dynsf,PETSCSF_WINDOW_FLAVOR_CREATE);CHKERRQ(ierr); /* break recursion */
    /* remotes ownership transfers to dynsf via PETSC_OWN_POINTER */
    ierr = PetscSFSetGraph(w->dynsf,1,sf->nranks,NULL,PETSC_OWN_POINTER,remotes,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)sf,(PetscObject)w->dynsf);CHKERRQ(ierr);
  }
  switch (w->sync) {
  case PETSCSF_WINDOW_SYNC_ACTIVE:
    /* Create the in/out process groups up front; the handles are not needed here
       (presumably cached inside PetscSFGetGroups() -- confirm) */
    ierr = PetscSFGetGroups(sf,&ingroup,&outgroup);CHKERRQ(ierr);
    /* fallthrough */
  default:
    break;
  }
  PetscFunctionReturn(0);
}
696 
697 static PetscErrorCode PetscSFSetFromOptions_Window(PetscOptionItems *PetscOptionsObject,PetscSF sf)
698 {
699   PetscSF_Window          *w = (PetscSF_Window*)sf->data;
700   PetscErrorCode          ierr;
701   PetscSFWindowFlavorType flavor = w->flavor;
702 
703   PetscFunctionBegin;
704   ierr = PetscOptionsHead(PetscOptionsObject,"PetscSF Window options");CHKERRQ(ierr);
705   ierr = PetscOptionsEnum("-sf_window_sync","synchronization type to use for PetscSF Window communication","PetscSFWindowSetSyncType",PetscSFWindowSyncTypes,(PetscEnum)w->sync,(PetscEnum*)&w->sync,NULL);CHKERRQ(ierr);
706   ierr = PetscOptionsEnum("-sf_window_flavor","flavor to use for PetscSF Window creation","PetscSFWindowSetFlavorType",PetscSFWindowFlavorTypes,(PetscEnum)flavor,(PetscEnum*)&flavor,NULL);CHKERRQ(ierr);
707   ierr = PetscSFWindowSetFlavorType(sf,flavor);CHKERRQ(ierr);
708   ierr = PetscOptionsTail();CHKERRQ(ierr);
709   PetscFunctionReturn(0);
710 }
711 
/* Release all cached datatypes and windows; errors if any window is still checked out */
static PetscErrorCode PetscSFReset_Window(PetscSF sf)
{
  PetscSF_Window  *w = (PetscSF_Window*)sf->data;
  PetscErrorCode  ierr;
  PetscSFDataLink link,next;
  PetscSFWinLink  wlink,wnext;
  PetscInt        i;

  PetscFunctionBegin;
  /* Free the composite-datatype cache (one mine/remote pair per root rank per entry) */
  for (link=w->link; link; link=next) {
    next = link->next;
    ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) {
      ierr = MPI_Type_free(&link->mine[i]);CHKERRQ(ierr);
      ierr = MPI_Type_free(&link->remote[i]);CHKERRQ(ierr);
    }
    ierr = PetscFree2(link->mine,link->remote);CHKERRQ(ierr);
    ierr = PetscFree(link);CHKERRQ(ierr);
  }
  w->link = NULL;
  /* Free every window in the reuse list */
  for (wlink=w->wins; wlink; wlink=wnext) {
    wnext = wlink->next;
    if (wlink->inuse) SETERRQ1(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Window still in use with address %p",(void*)wlink->addr);
    ierr = PetscFree(wlink->dyn_target_addr);CHKERRQ(ierr);
    ierr = MPI_Win_free(&wlink->win);CHKERRQ(ierr);
    ierr = PetscFree(wlink);CHKERRQ(ierr);
  }
  w->wins = NULL;
  ierr = PetscSFDestroy(&w->dynsf);CHKERRQ(ierr);
  /* Release the duplicated info handle, if any */
  if (w->info != MPI_INFO_NULL) {
    ierr = MPI_Info_free(&w->info);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
746 
747 static PetscErrorCode PetscSFDestroy_Window(PetscSF sf)
748 {
749   PetscErrorCode ierr;
750 
751   PetscFunctionBegin;
752   ierr = PetscSFReset_Window(sf);CHKERRQ(ierr);
753   ierr = PetscFree(sf->data);CHKERRQ(ierr);
754   ierr = PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowSetSyncType_C",NULL);CHKERRQ(ierr);
755   ierr = PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowGetSyncType_C",NULL);CHKERRQ(ierr);
756   ierr = PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowSetFlavorType_C",NULL);CHKERRQ(ierr);
757   ierr = PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowGetFlavorType_C",NULL);CHKERRQ(ierr);
758   ierr = PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowSetInfo_C",NULL);CHKERRQ(ierr);
759   ierr = PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowGetInfo_C",NULL);CHKERRQ(ierr);
760   PetscFunctionReturn(0);
761 }
762 
/* View the window SF settings; detailed ASCII format also dumps the MPI info key/value pairs */
static PetscErrorCode PetscSFView_Window(PetscSF sf,PetscViewer viewer)
{
  PetscSF_Window    *w = (PetscSF_Window*)sf->data;
  PetscErrorCode    ierr;
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii) {
    ierr = PetscViewerASCIIPrintf(viewer,"  current flavor=%s synchronization=%s sort=%s\n",PetscSFWindowFlavorTypes[w->flavor],PetscSFWindowSyncTypes[w->sync],sf->rankorder ? "rank-order" : "unordered");CHKERRQ(ierr);
    if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
      if (w->info != MPI_INFO_NULL) {
        PetscMPIInt k,nkeys;
        char        key[MPI_MAX_INFO_KEY], value[MPI_MAX_INFO_VAL];

        /* Enumerate the info object's keys and print each key = value pair */
        ierr = MPI_Info_get_nkeys(w->info,&nkeys);CHKERRQ(ierr);
        ierr = PetscViewerASCIIPrintf(viewer,"    current info with %d keys. Ordered key-value pairs follow:\n",nkeys);CHKERRQ(ierr);
        for (k = 0; k < nkeys; k++) {
          PetscMPIInt flag;

          ierr = MPI_Info_get_nthkey(w->info,k,key);CHKERRQ(ierr);
          ierr = MPI_Info_get(w->info,key,MPI_MAX_INFO_VAL,value,&flag);CHKERRQ(ierr);
          /* flag is false only if the key vanished between the two queries */
          if (!flag) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing key %s",key);
          ierr = PetscViewerASCIIPrintf(viewer,"      %s = %s\n",key,value);CHKERRQ(ierr);
        }
      } else {
        ierr = PetscViewerASCIIPrintf(viewer,"    current info=MPI_INFO_NULL\n");CHKERRQ(ierr);
      }
    }
  }
  PetscFunctionReturn(0);
}
797 
798 static PetscErrorCode PetscSFDuplicate_Window(PetscSF sf,PetscSFDuplicateOption opt,PetscSF newsf)
799 {
800   PetscSF_Window        *w = (PetscSF_Window*)sf->data;
801   PetscErrorCode        ierr;
802   PetscSFWindowSyncType synctype;
803 
804   PetscFunctionBegin;
805   synctype = w->sync;
806   /* HACK: Must use FENCE or LOCK when called from PetscSFGetGroups() because ACTIVE here would cause recursion. */
807   if (!sf->setupcalled) synctype = PETSCSF_WINDOW_SYNC_LOCK;
808   ierr = PetscSFWindowSetSyncType(newsf,synctype);CHKERRQ(ierr);
809   ierr = PetscSFWindowSetFlavorType(newsf,w->flavor);CHKERRQ(ierr);
810   ierr = PetscSFWindowSetInfo(newsf,w->info);CHKERRQ(ierr);
811   PetscFunctionReturn(0);
812 }
813 
814 static PetscErrorCode PetscSFBcastAndOpBegin_Window(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,void *leafdata,MPI_Op op)
815 {
816   PetscSF_Window     *w = (PetscSF_Window*)sf->data;
817   PetscErrorCode     ierr;
818   PetscInt           i,nranks;
819   const PetscMPIInt  *ranks;
820   const MPI_Aint     *target_disp;
821   const MPI_Datatype *mine,*remote;
822   MPI_Win            win;
823 
824   PetscFunctionBegin;
825   if (op != MPI_REPLACE || op != MPIU_REPLACE) SETERRQ(PetscObjectComm((PetscObject)sf), PETSC_ERR_SUP, "PetscSFBcastAndOpBegin_Window with op!=MPI_REPLACE has not been implemented");
826   ierr = PetscSFGetRootRanks(sf,&nranks,&ranks,NULL,NULL,NULL);CHKERRQ(ierr);
827   ierr = PetscSFWindowGetDataTypes(sf,unit,&mine,&remote);CHKERRQ(ierr);
828   ierr = PetscSFGetWindow(sf,unit,(void*)rootdata,w->sync,PETSC_TRUE,MPI_MODE_NOPUT|MPI_MODE_NOPRECEDE,MPI_MODE_NOPUT,0,&target_disp,&win);CHKERRQ(ierr);
829   for (i=0; i<nranks; i++) {
830     MPI_Aint tdp = target_disp ? target_disp[i] : 0;
831 
832     if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) {ierr = MPI_Win_lock(MPI_LOCK_SHARED,ranks[i],MPI_MODE_NOCHECK,win);CHKERRQ(ierr);}
833     ierr = MPI_Get(leafdata,1,mine[i],ranks[i],tdp,1,remote[i],win);CHKERRQ(ierr);
834     if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) {ierr = MPI_Win_unlock(ranks[i],win);CHKERRQ(ierr);}
835   }
836   PetscFunctionReturn(0);
837 }
838 
839 PetscErrorCode PetscSFBcastAndOpEnd_Window(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,void *leafdata,MPI_Op op)
840 {
841   PetscSF_Window *w = (PetscSF_Window*)sf->data;
842   PetscErrorCode ierr;
843   MPI_Win        win;
844 
845   PetscFunctionBegin;
846   ierr = PetscSFFindWindow(sf,unit,rootdata,&win);CHKERRQ(ierr);
847   ierr = PetscSFRestoreWindow(sf,unit,(void*)rootdata,w->sync,PETSC_TRUE,MPI_MODE_NOSTORE|MPI_MODE_NOSUCCEED,PETSC_FALSE,&win);CHKERRQ(ierr);
848   PetscFunctionReturn(0);
849 }
850 
/* Begin a reduction from leaves to roots: each process MPI_Accumulates its leaf
   values into the owning root's window with the (translated) reduction op. */
PetscErrorCode PetscSFReduceBegin_Window(PetscSF sf,MPI_Datatype unit,PetscMemType leafmtype,const void *leafdata,PetscMemType rootmtype,void *rootdata,MPI_Op op)
{
  PetscSF_Window     *w = (PetscSF_Window*)sf->data;
  PetscErrorCode     ierr;
  PetscInt           i,nranks;
  const PetscMPIInt  *ranks;
  const MPI_Aint     *target_disp;  /* per-rank base displacements; non-NULL only for dynamic windows */
  const MPI_Datatype *mine,*remote;
  MPI_Win            win;

  PetscFunctionBegin;
  ierr = PetscSFGetRootRanks(sf,&nranks,&ranks,NULL,NULL,NULL);CHKERRQ(ierr);
  ierr = PetscSFWindowGetDataTypes(sf,unit,&mine,&remote);CHKERRQ(ierr);
  /* map MPIU_* ops to built-in MPI ops so MPI_Accumulate accepts them */
  ierr = PetscSFWindowOpTranslate(&op);CHKERRQ(ierr);
  ierr = PetscSFGetWindow(sf,unit,rootdata,w->sync,PETSC_TRUE,MPI_MODE_NOPRECEDE,0,0,&target_disp,&win);CHKERRQ(ierr);
  for (i=0; i<nranks; i++) {
    MPI_Aint tdp = target_disp ? target_disp[i] : 0;

    if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) {ierr = MPI_Win_lock(MPI_LOCK_SHARED,ranks[i],MPI_MODE_NOCHECK,win);CHKERRQ(ierr);}
    /* deliberately NOT wrapped in CHKERRQ: the raw MPI return code is inspected below */
    ierr = MPI_Accumulate((void*)leafdata,1,mine[i],ranks[i],tdp,1,remote[i],op,win);
    if (ierr) { /* intercept the MPI error since the combination of unit and op is not supported */
      PetscMPIInt len;
      char        errstring[MPI_MAX_ERROR_STRING];

      MPI_Error_string(ierr,errstring,&len);
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Internal error in MPI: %s",errstring);
    }
    if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) {ierr = MPI_Win_unlock(ranks[i],win);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}
882 
883 static PetscErrorCode PetscSFReduceEnd_Window(PetscSF sf,MPI_Datatype unit,PetscMemType leafmtype,const void *leafdata,PetscMemType rootmtype,void *rootdata,MPI_Op op)
884 {
885   PetscSF_Window *w = (PetscSF_Window*)sf->data;
886   PetscErrorCode ierr;
887   MPI_Win        win;
888 
889   PetscFunctionBegin;
890   if (!w->wins) PetscFunctionReturn(0);
891   ierr = PetscSFFindWindow(sf,unit,rootdata,&win);CHKERRQ(ierr);
892   ierr = PetscSFRestoreWindow(sf,unit,rootdata,w->sync,PETSC_TRUE,MPI_MODE_NOSUCCEED,PETSC_TRUE,&win);CHKERRQ(ierr);
893   PetscFunctionReturn(0);
894 }
895 
/* Begin a fetch-and-op: atomically fetch the current root values into leafupdate and
   accumulate leafdata into the roots. With MPI_Get_accumulate this is a single atomic
   call per rank; without it, exclusive locks emulate the atomicity with a Get+Accumulate
   pair. */
static PetscErrorCode PetscSFFetchAndOpBegin_Window(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,void *rootdata,PetscMemType leafmtype,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode     ierr;
  PetscInt           i,nranks;
  const PetscMPIInt  *ranks;
  const MPI_Datatype *mine,*remote;
  const MPI_Aint     *target_disp;  /* per-rank base displacements; non-NULL only for dynamic windows */
  MPI_Win            win;
  PetscSF_Window     *w = (PetscSF_Window*)sf->data;
#if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  PetscSFWindowFlavorType oldf;     /* saved flavor, restored before returning */
#endif

  PetscFunctionBegin;
  ierr = PetscSFGetRootRanks(sf,&nranks,&ranks,NULL,NULL,NULL);CHKERRQ(ierr);
  ierr = PetscSFWindowGetDataTypes(sf,unit,&mine,&remote);CHKERRQ(ierr);
  /* map MPIU_* ops to built-in MPI ops so the accumulate calls accept them */
  ierr = PetscSFWindowOpTranslate(&op);CHKERRQ(ierr);
#if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  /* FetchAndOp without MPI_Get_Accumulate requires locking.
     we create a new window every time to not interfere with user-defined MPI_Info which may have used "no_locks"="true" */
  oldf = w->flavor;
  w->flavor = PETSCSF_WINDOW_FLAVOR_CREATE;
  ierr = PetscSFGetWindow(sf,unit,rootdata,PETSCSF_WINDOW_SYNC_LOCK,PETSC_FALSE,0,0,0,&target_disp,&win);CHKERRQ(ierr);
#else
  ierr = PetscSFGetWindow(sf,unit,rootdata,w->sync,PETSC_TRUE,MPI_MODE_NOPRECEDE,0,0,&target_disp,&win);CHKERRQ(ierr);
#endif
  for (i=0; i<nranks; i++) {
    MPI_Aint tdp = target_disp ? target_disp[i] : 0;

#if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
    /* exclusive lock makes the Get followed by Accumulate atomic with respect to other origins */
    ierr = MPI_Win_lock(MPI_LOCK_EXCLUSIVE,ranks[i],0,win);CHKERRQ(ierr);
    ierr = MPI_Get(leafupdate,1,mine[i],ranks[i],tdp,1,remote[i],win);CHKERRQ(ierr);
    /* deliberately NOT wrapped in CHKERRQ: the raw MPI return code is inspected below */
    ierr = MPI_Accumulate((void*)leafdata,1,mine[i],ranks[i],tdp,1,remote[i],op,win);
    if (ierr) { /* intercept the MPI error since the combination of unit and op is not supported */
      PetscMPIInt len;
      char        errstring[MPI_MAX_ERROR_STRING];

      MPI_Error_string(ierr,errstring,&len);
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Internal error in MPI: %s",errstring);
    }
    ierr = MPI_Win_unlock(ranks[i],win);CHKERRQ(ierr);
#else
    if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) { ierr = MPI_Win_lock(MPI_LOCK_SHARED,ranks[i],0,win);CHKERRQ(ierr); }
    /* deliberately NOT wrapped in CHKERRQ: the raw MPI return code is inspected below */
    ierr = MPI_Get_accumulate((void*)leafdata,1,mine[i],leafupdate,1,mine[i],ranks[i],tdp,1,remote[i],op,win);
    if (ierr) { /* intercept the MPI error since the combination of unit and op is not supported */
      PetscMPIInt len;
      char        errstring[MPI_MAX_ERROR_STRING];

      MPI_Error_string(ierr,errstring,&len);
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Internal error in MPI: %s",errstring);
    }
    if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) { ierr = MPI_Win_unlock(ranks[i],win);CHKERRQ(ierr); }
#endif
  }
#if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  w->flavor = oldf;
#endif
  PetscFunctionReturn(0);
}
955 
/* Finish a fetch-and-op begun by PetscSFFetchAndOpBegin_Window: close the epoch with
   arguments mirroring the Begin routine (LOCK sync and no-epoch when Get_accumulate is
   unavailable), updating rootdata from the window buffer. */
static PetscErrorCode PetscSFFetchAndOpEnd_Window(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,void *rootdata,PetscMemType leafmtype,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;
  MPI_Win        win;
#if defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  PetscSF_Window *w = (PetscSF_Window*)sf->data;  /* only needed for w->sync in this branch */
#endif

  PetscFunctionBegin;
  ierr = PetscSFFindWindow(sf,unit,rootdata,&win);CHKERRQ(ierr);
#if defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
  ierr = PetscSFRestoreWindow(sf,unit,rootdata,w->sync,PETSC_TRUE,MPI_MODE_NOSUCCEED,PETSC_TRUE,&win);CHKERRQ(ierr);
#else
  /* must match the LOCK sync / no-epoch combination used by the Begin routine */
  ierr = PetscSFRestoreWindow(sf,unit,rootdata,PETSCSF_WINDOW_SYNC_LOCK,PETSC_FALSE,0,PETSC_TRUE,&win);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
973 
974 PETSC_INTERN PetscErrorCode PetscSFCreate_Window(PetscSF sf)
975 {
976   PetscSF_Window *w = (PetscSF_Window*)sf->data;
977   PetscErrorCode ierr;
978 
979   PetscFunctionBegin;
980   sf->ops->SetUp           = PetscSFSetUp_Window;
981   sf->ops->SetFromOptions  = PetscSFSetFromOptions_Window;
982   sf->ops->Reset           = PetscSFReset_Window;
983   sf->ops->Destroy         = PetscSFDestroy_Window;
984   sf->ops->View            = PetscSFView_Window;
985   sf->ops->Duplicate       = PetscSFDuplicate_Window;
986   sf->ops->BcastAndOpBegin = PetscSFBcastAndOpBegin_Window;
987   sf->ops->BcastAndOpEnd   = PetscSFBcastAndOpEnd_Window;
988   sf->ops->ReduceBegin     = PetscSFReduceBegin_Window;
989   sf->ops->ReduceEnd       = PetscSFReduceEnd_Window;
990   sf->ops->FetchAndOpBegin = PetscSFFetchAndOpBegin_Window;
991   sf->ops->FetchAndOpEnd   = PetscSFFetchAndOpEnd_Window;
992 
993   ierr = PetscNewLog(sf,&w);CHKERRQ(ierr);
994   sf->data  = (void*)w;
995   w->sync   = PETSCSF_WINDOW_SYNC_FENCE;
996   w->flavor = PETSCSF_WINDOW_FLAVOR_CREATE;
997   w->info   = MPI_INFO_NULL;
998 
999   ierr = PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowSetSyncType_C",PetscSFWindowSetSyncType_Window);CHKERRQ(ierr);
1000   ierr = PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowGetSyncType_C",PetscSFWindowGetSyncType_Window);CHKERRQ(ierr);
1001   ierr = PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowSetFlavorType_C",PetscSFWindowSetFlavorType_Window);CHKERRQ(ierr);
1002   ierr = PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowGetFlavorType_C",PetscSFWindowGetFlavorType_Window);CHKERRQ(ierr);
1003   ierr = PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowSetInfo_C",PetscSFWindowSetInfo_Window);CHKERRQ(ierr);
1004   ierr = PetscObjectComposeFunction((PetscObject)sf,"PetscSFWindowGetInfo_C",PetscSFWindowGetInfo_Window);CHKERRQ(ierr);
1005 
1006 #if defined(OMPI_MAJOR_VERSION) && (OMPI_MAJOR_VERSION < 1 || (OMPI_MAJOR_VERSION == 1 && OMPI_MINOR_VERSION <= 6))
1007   {
1008     PetscBool ackbug = PETSC_FALSE;
1009     ierr = PetscOptionsGetBool(NULL,NULL,"-acknowledge_ompi_onesided_bug",&ackbug,NULL);CHKERRQ(ierr);
1010     if (ackbug) {
1011       ierr = PetscInfo(sf,"Acknowledged Open MPI bug, proceeding anyway. Expect memory corruption.\n");CHKERRQ(ierr);
1012     } else SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_LIB,"Open MPI is known to be buggy (https://svn.open-mpi.org/trac/ompi/ticket/1905 and 2656), use -acknowledge_ompi_onesided_bug to proceed");
1013   }
1014 #endif
1015   PetscFunctionReturn(0);
1016 }
1017