xref: /petsc/src/vec/is/sf/tutorials/ex1.c (revision 8fb5bd83c3955fefcf33a54e3bb66920a9fa884b)
1 static const char help[] = "Test star forest communication (PetscSF)\n\n";
2 
3 /*
4     Description: A star is a simple tree with one root and zero or more leaves.
5     A star forest is a union of disjoint stars.
6     Many common communication patterns can be expressed as updates of rootdata using leafdata and vice-versa.
7     This example creates a star forest, communicates values using the graph (see options for types of communication), views the graph, then destroys it.
8 */
9 
10 /*
11   Include petscsf.h so we can use PetscSF objects. Note that this automatically
12   includes petscsys.h.
13 */
14 #include <petscsf.h>
15 #include <petscviewer.h>
16 
17 /* like PetscSFView() but with alternative array of local indices */
18 static PetscErrorCode PetscSFViewCustomLocals_Private(PetscSF sf,const PetscInt locals[],PetscViewer viewer)
19 {
20   const PetscSFNode *iremote;
21   PetscInt          i,nroots,nleaves,nranks;
22   PetscMPIInt       rank;
23 
24   PetscFunctionBeginUser;
25   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank));
26   PetscCall(PetscSFGetGraph(sf,&nroots,&nleaves,NULL,&iremote));
27   PetscCall(PetscSFGetRootRanks(sf,&nranks,NULL,NULL,NULL,NULL));
28   PetscCall(PetscViewerASCIIPushTab(viewer));
29   PetscCall(PetscViewerASCIIPushSynchronized(viewer));
30   PetscCall(PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number of roots=%" PetscInt_FMT ", leaves=%" PetscInt_FMT ", remote ranks=%" PetscInt_FMT "\n",rank,nroots,nleaves,nranks));
31   for (i=0; i<nleaves; i++) {
32     PetscCall(PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %" PetscInt_FMT " <- (%" PetscInt_FMT ",%" PetscInt_FMT ")\n",rank,locals[i],iremote[i].rank,iremote[i].index));
33   }
34   PetscCall(PetscViewerFlush(viewer));
35   PetscCall(PetscViewerASCIIPopTab(viewer));
36   PetscCall(PetscViewerASCIIPopSynchronized(viewer));
37   PetscFunctionReturn(0);
38 }
39 
40 int main(int argc,char **argv)
41 {
42   PetscInt       i,nroots,nrootsalloc,nleaves,nleavesalloc,*mine,stride;
43   PetscSFNode    *remote;
44   PetscMPIInt    rank,size;
45   PetscSF        sf;
46   PetscBool      test_all,test_bcast,test_bcastop,test_reduce,test_degree,test_fetchandop,test_gather,test_scatter,test_embed,test_invert,test_sf_distribute,test_char;
47   MPI_Op         mop=MPI_OP_NULL; /* initialize to prevent compiler warnings with cxx_quad build */
48   char           opstring[256];
49   PetscBool      strflg;
50 
51   PetscCall(PetscInitialize(&argc,&argv,(char*)0,help));
52   PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD,&rank));
53   PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD,&size));
54 
55   PetscOptionsBegin(PETSC_COMM_WORLD,"","PetscSF Test Options","none");
56   test_all        = PETSC_FALSE;
57   PetscCall(PetscOptionsBool("-test_all","Test all SF communications","",test_all,&test_all,NULL));
58   test_bcast      = test_all;
59   PetscCall(PetscOptionsBool("-test_bcast","Test broadcast","",test_bcast,&test_bcast,NULL));
60   test_bcastop    = test_all;
61   PetscCall(PetscOptionsBool("-test_bcastop","Test broadcast and reduce","",test_bcastop,&test_bcastop,NULL));
62   test_reduce     = test_all;
63   PetscCall(PetscOptionsBool("-test_reduce","Test reduction","",test_reduce,&test_reduce,NULL));
64   test_char       = test_all;
65   PetscCall(PetscOptionsBool("-test_char","Test signed char, unsigned char, and char","",test_char,&test_char,NULL));
66   mop             = MPI_SUM;
67   PetscCall(PetscStrcpy(opstring,"sum"));
68   PetscCall(PetscOptionsString("-test_op","Designate which MPI_Op to use","",opstring,opstring,sizeof(opstring),NULL));
69   PetscCall(PetscStrcmp("sum",opstring,&strflg));
70   if (strflg) {
71     mop = MPIU_SUM;
72   }
73   PetscCall(PetscStrcmp("prod",opstring,&strflg));
74   if (strflg) {
75     mop = MPI_PROD;
76   }
77   PetscCall(PetscStrcmp("max",opstring,&strflg));
78   if (strflg) {
79     mop = MPI_MAX;
80   }
81   PetscCall(PetscStrcmp("min",opstring,&strflg));
82   if (strflg) {
83     mop = MPI_MIN;
84   }
85   PetscCall(PetscStrcmp("land",opstring,&strflg));
86   if (strflg) {
87     mop = MPI_LAND;
88   }
89   PetscCall(PetscStrcmp("band",opstring,&strflg));
90   if (strflg) {
91     mop = MPI_BAND;
92   }
93   PetscCall(PetscStrcmp("lor",opstring,&strflg));
94   if (strflg) {
95     mop = MPI_LOR;
96   }
97   PetscCall(PetscStrcmp("bor",opstring,&strflg));
98   if (strflg) {
99     mop = MPI_BOR;
100   }
101   PetscCall(PetscStrcmp("lxor",opstring,&strflg));
102   if (strflg) {
103     mop = MPI_LXOR;
104   }
105   PetscCall(PetscStrcmp("bxor",opstring,&strflg));
106   if (strflg) {
107     mop = MPI_BXOR;
108   }
109   test_degree     = test_all;
110   PetscCall(PetscOptionsBool("-test_degree","Test computation of vertex degree","",test_degree,&test_degree,NULL));
111   test_fetchandop = test_all;
112   PetscCall(PetscOptionsBool("-test_fetchandop","Test atomic Fetch-And-Op","",test_fetchandop,&test_fetchandop,NULL));
113   test_gather     = test_all;
114   PetscCall(PetscOptionsBool("-test_gather","Test point gather","",test_gather,&test_gather,NULL));
115   test_scatter    = test_all;
116   PetscCall(PetscOptionsBool("-test_scatter","Test point scatter","",test_scatter,&test_scatter,NULL));
117   test_embed      = test_all;
118   PetscCall(PetscOptionsBool("-test_embed","Test point embed","",test_embed,&test_embed,NULL));
119   test_invert     = test_all;
120   PetscCall(PetscOptionsBool("-test_invert","Test point invert","",test_invert,&test_invert,NULL));
121   stride          = 1;
122   PetscCall(PetscOptionsInt("-stride","Stride for leaf and root data","",stride,&stride,NULL));
123   test_sf_distribute = PETSC_FALSE;
124   PetscCall(PetscOptionsBool("-test_sf_distribute","Create an SF that 'distributes' to each process, like an alltoall","",test_sf_distribute,&test_sf_distribute,NULL));
125   PetscCall(PetscOptionsString("-test_op","Designate which MPI_Op to use","",opstring,opstring,sizeof(opstring),NULL));
126   PetscOptionsEnd();
127 
128   if (test_sf_distribute) {
129     nroots = size;
130     nrootsalloc = size;
131     nleaves = size;
132     nleavesalloc = size;
133     mine = NULL;
134     PetscCall(PetscMalloc1(nleaves,&remote));
135     for (i=0; i<size; i++) {
136       remote[i].rank = i;
137       remote[i].index = rank;
138     }
139   } else {
140     nroots       = 2 + (PetscInt)(rank == 0);
141     nrootsalloc  = nroots * stride;
142     nleaves      = 2 + (PetscInt)(rank > 0);
143     nleavesalloc = nleaves * stride;
144     mine         = NULL;
145     if (stride > 1) {
146       PetscInt i;
147 
148       PetscCall(PetscMalloc1(nleaves,&mine));
149       for (i = 0; i < nleaves; i++) {
150         mine[i] = stride * i;
151       }
152     }
153     PetscCall(PetscMalloc1(nleaves,&remote));
154     /* Left periodic neighbor */
155     remote[0].rank  = (rank+size-1)%size;
156     remote[0].index = 1 * stride;
157     /* Right periodic neighbor */
158     remote[1].rank  = (rank+1)%size;
159     remote[1].index = 0 * stride;
160     if (rank > 0) {               /* All processes reference rank 0, index 1 */
161       remote[2].rank  = 0;
162       remote[2].index = 2 * stride;
163     }
164   }
165 
166   /* Create a star forest for communication. In this example, the leaf space is dense, so we pass NULL. */
167   PetscCall(PetscSFCreate(PETSC_COMM_WORLD,&sf));
168   PetscCall(PetscSFSetFromOptions(sf));
169   PetscCall(PetscSFSetGraph(sf,nrootsalloc,nleaves,mine,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER));
170   PetscCall(PetscSFSetUp(sf));
171 
172   /* View graph, mostly useful for debugging purposes. */
173   PetscCall(PetscViewerPushFormat(PETSC_VIEWER_STDOUT_WORLD,PETSC_VIEWER_ASCII_INFO_DETAIL));
174   PetscCall(PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD));
175   PetscCall(PetscViewerPopFormat(PETSC_VIEWER_STDOUT_WORLD));
176 
177   if (test_bcast) {             /* broadcast rootdata into leafdata */
178     PetscInt *rootdata,*leafdata;
179     /* Allocate space for send and receive buffers. This example communicates PetscInt, but other types, including
180      * user-defined structures, could also be used. */
181     PetscCall(PetscMalloc2(nrootsalloc,&rootdata,nleavesalloc,&leafdata));
182     /* Set rootdata buffer to be broadcast */
183     for (i=0; i<nrootsalloc; i++) rootdata[i] = -1;
184     for (i=0; i<nroots; i++) rootdata[i*stride] = 100*(rank+1) + i;
185     /* Initialize local buffer, these values are never used. */
186     for (i=0; i<nleavesalloc; i++) leafdata[i] = -1;
187     /* Broadcast entries from rootdata to leafdata. Computation or other communication can be performed between the begin and end calls. */
188     PetscCall(PetscSFBcastBegin(sf,MPIU_INT,rootdata,leafdata,MPI_REPLACE));
189     PetscCall(PetscSFBcastEnd(sf,MPIU_INT,rootdata,leafdata,MPI_REPLACE));
190     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Bcast Rootdata\n"));
191     PetscCall(PetscIntView(nrootsalloc,rootdata,PETSC_VIEWER_STDOUT_WORLD));
192     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Bcast Leafdata\n"));
193     PetscCall(PetscIntView(nleavesalloc,leafdata,PETSC_VIEWER_STDOUT_WORLD));
194     PetscCall(PetscFree2(rootdata,leafdata));
195   }
196 
197   if (test_bcast && test_char) { /* Bcast with char */
198     PetscInt len;
199     char buf[256];
200     char *rootdata,*leafdata;
201     PetscCall(PetscMalloc2(nrootsalloc,&rootdata,nleavesalloc,&leafdata));
202     /* Set rootdata buffer to be broadcast */
203     for (i=0; i<nrootsalloc; i++) rootdata[i] = '*';
204     for (i=0; i<nroots; i++) rootdata[i*stride] = 'A' + rank*3 + i; /* rank is very small, so it is fine to compute a char */
205     /* Initialize local buffer, these values are never used. */
206     for (i=0; i<nleavesalloc; i++) leafdata[i] = '?';
207 
208     PetscCall(PetscSFBcastBegin(sf,MPI_CHAR,rootdata,leafdata,MPI_REPLACE));
209     PetscCall(PetscSFBcastEnd(sf,MPI_CHAR,rootdata,leafdata,MPI_REPLACE));
210 
211     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Bcast Rootdata in type of char\n"));
212     len  = 0; PetscCall(PetscSNPrintf(buf,256,"%4d:",rank)); len += 5;
213     for (i=0; i<nrootsalloc; i++) {PetscCall(PetscSNPrintf(buf+len,256-len,"%5c",rootdata[i])); len += 5;}
214     PetscCall(PetscSynchronizedPrintf(PETSC_COMM_WORLD,"%s\n",buf));
215     PetscCall(PetscSynchronizedFlush(PETSC_COMM_WORLD,PETSC_STDOUT));
216 
217     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Bcast Leafdata in type of char\n"));
218     len = 0; PetscCall(PetscSNPrintf(buf,256,"%4d:",rank)); len += 5;
219     for (i=0; i<nleavesalloc; i++) {PetscCall(PetscSNPrintf(buf+len,256-len,"%5c",leafdata[i])); len += 5;}
220     PetscCall(PetscSynchronizedPrintf(PETSC_COMM_WORLD,"%s\n",buf));
221     PetscCall(PetscSynchronizedFlush(PETSC_COMM_WORLD,PETSC_STDOUT));
222 
223     PetscCall(PetscFree2(rootdata,leafdata));
224   }
225 
226   if (test_bcastop) {         /* Reduce rootdata into leafdata */
227     PetscInt *rootdata,*leafdata;
228     /* Allocate space for send and receive buffers. This example communicates PetscInt, but other types, including
229      * user-defined structures, could also be used. */
230     PetscCall(PetscMalloc2(nrootsalloc,&rootdata,nleavesalloc,&leafdata));
231     /* Set rootdata buffer to be broadcast */
232     for (i=0; i<nrootsalloc; i++) rootdata[i] = -1;
233     for (i=0; i<nroots; i++) rootdata[i*stride] = 100*(rank+1) + i;
234     /* Set leaf values to reduce with */
235     for (i=0; i<nleavesalloc; i++) leafdata[i] = -10*(rank+1) - i;
236     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Pre-BcastAndOp Leafdata\n"));
237     PetscCall(PetscIntView(nleavesalloc,leafdata,PETSC_VIEWER_STDOUT_WORLD));
238     /* Broadcast entries from rootdata to leafdata. Computation or other communication can be performed between the begin and end calls. */
239     PetscCall(PetscSFBcastBegin(sf,MPIU_INT,rootdata,leafdata,mop));
240     PetscCall(PetscSFBcastEnd(sf,MPIU_INT,rootdata,leafdata,mop));
241     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## BcastAndOp Rootdata\n"));
242     PetscCall(PetscIntView(nrootsalloc,rootdata,PETSC_VIEWER_STDOUT_WORLD));
243     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## BcastAndOp Leafdata\n"));
244     PetscCall(PetscIntView(nleavesalloc,leafdata,PETSC_VIEWER_STDOUT_WORLD));
245     PetscCall(PetscFree2(rootdata,leafdata));
246   }
247 
248   if (test_reduce) {            /* Reduce leafdata into rootdata */
249     PetscInt *rootdata,*leafdata;
250     PetscCall(PetscMalloc2(nrootsalloc,&rootdata,nleavesalloc,&leafdata));
251     /* Initialize rootdata buffer in which the result of the reduction will appear. */
252     for (i=0; i<nrootsalloc; i++) rootdata[i] = -1;
253     for (i=0; i<nroots; i++) rootdata[i*stride] = 100*(rank+1) + i;
254     /* Set leaf values to reduce. */
255     for (i=0; i<nleavesalloc; i++) leafdata[i] = -1;
256     for (i=0; i<nleaves; i++) leafdata[i*stride] = 1000*(rank+1) + 10*i;
257     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Pre-Reduce Rootdata\n"));
258     PetscCall(PetscIntView(nrootsalloc,rootdata,PETSC_VIEWER_STDOUT_WORLD));
259     /* Perform reduction. Computation or other communication can be performed between the begin and end calls.
260      * This example sums the values, but other MPI_Ops can be used (e.g MPI_MAX, MPI_PROD). */
261     PetscCall(PetscSFReduceBegin(sf,MPIU_INT,leafdata,rootdata,mop));
262     PetscCall(PetscSFReduceEnd(sf,MPIU_INT,leafdata,rootdata,mop));
263     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Reduce Leafdata\n"));
264     PetscCall(PetscIntView(nleavesalloc,leafdata,PETSC_VIEWER_STDOUT_WORLD));
265     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Reduce Rootdata\n"));
266     PetscCall(PetscIntView(nrootsalloc,rootdata,PETSC_VIEWER_STDOUT_WORLD));
267     PetscCall(PetscFree2(rootdata,leafdata));
268   }
269 
270   if (test_reduce && test_char) { /* Reduce with signed char */
271     PetscInt len;
272     char buf[256];
273     signed char *rootdata,*leafdata;
274     PetscCall(PetscMalloc2(nrootsalloc,&rootdata,nleavesalloc,&leafdata));
275     /* Initialize rootdata buffer in which the result of the reduction will appear. */
276     for (i=0; i<nrootsalloc; i++) rootdata[i] = -1;
277     for (i=0; i<nroots; i++) rootdata[i*stride] = 10*(rank+1) + i;
278     /* Set leaf values to reduce. */
279     for (i=0; i<nleavesalloc; i++) leafdata[i] = -1;
280     for (i=0; i<nleaves; i++) leafdata[i*stride] = 50*(rank+1) + 10*i;
281     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Pre-Reduce Rootdata in type of signed char\n"));
282 
283     len = 0; PetscCall(PetscSNPrintf(buf,256,"%4d:",rank)); len += 5;
284     for (i=0; i<nrootsalloc; i++) {PetscCall(PetscSNPrintf(buf+len,256-len,"%5d",rootdata[i])); len += 5;}
285     PetscCall(PetscSynchronizedPrintf(PETSC_COMM_WORLD,"%s\n",buf));
286     PetscCall(PetscSynchronizedFlush(PETSC_COMM_WORLD,PETSC_STDOUT));
287 
288     /* Using MPI_CHAR should trigger an error since MPI standard does not support reduction on MPI_CHAR.
289        Testing with -test_op max, one can see the sign does take effect in MPI_MAX.
290      */
291     PetscCall(PetscSFReduceBegin(sf,MPI_SIGNED_CHAR,leafdata,rootdata,mop));
292     PetscCall(PetscSFReduceEnd(sf,MPI_SIGNED_CHAR,leafdata,rootdata,mop));
293 
294     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Reduce Leafdata in type of signed char\n"));
295     len  = 0; PetscCall(PetscSNPrintf(buf,256,"%4d:",rank)); len += 5;
296     for (i=0; i<nleavesalloc; i++) {PetscCall(PetscSNPrintf(buf+len,256-len,"%5d",leafdata[i])); len += 5;}
297     PetscCall(PetscSynchronizedPrintf(PETSC_COMM_WORLD,"%s\n",buf));
298     PetscCall(PetscSynchronizedFlush(PETSC_COMM_WORLD,PETSC_STDOUT));
299 
300     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Reduce Rootdata in type of signed char\n"));
301     len = 0; PetscCall(PetscSNPrintf(buf,256,"%4d:",rank)); len += 5;
302     for (i=0; i<nrootsalloc; i++) {PetscCall(PetscSNPrintf(buf+len,256-len,"%5d",rootdata[i])); len += 5;}
303     PetscCall(PetscSynchronizedPrintf(PETSC_COMM_WORLD,"%s\n",buf));
304     PetscCall(PetscSynchronizedFlush(PETSC_COMM_WORLD,PETSC_STDOUT));
305 
306     PetscCall(PetscFree2(rootdata,leafdata));
307   }
308 
309   if (test_reduce && test_char) { /* Reduce with unsigned char */
310     PetscInt len;
311     char buf[256];
312     unsigned char *rootdata,*leafdata;
313     PetscCall(PetscMalloc2(nrootsalloc,&rootdata,nleavesalloc,&leafdata));
314     /* Initialize rootdata buffer in which the result of the reduction will appear. */
315     for (i=0; i<nrootsalloc; i++) rootdata[i] = 0;
316     for (i=0; i<nroots; i++) rootdata[i*stride] = 10*(rank+1) + i;
317     /* Set leaf values to reduce. */
318     for (i=0; i<nleavesalloc; i++) leafdata[i] = 0;
319     for (i=0; i<nleaves; i++) leafdata[i*stride] = 50*(rank+1) + 10*i;
320     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Pre-Reduce Rootdata in type of unsigned char\n"));
321 
322     len = 0; PetscCall(PetscSNPrintf(buf,256,"%4d:",rank)); len += 5;
323     for (i=0; i<nrootsalloc; i++) {PetscCall(PetscSNPrintf(buf+len,256-len,"%5u",rootdata[i])); len += 5;}
324     PetscCall(PetscSynchronizedPrintf(PETSC_COMM_WORLD,"%s\n",buf));
325     PetscCall(PetscSynchronizedFlush(PETSC_COMM_WORLD,PETSC_STDOUT));
326 
327     /* Using MPI_CHAR should trigger an error since MPI standard does not support reduction on MPI_CHAR.
328        Testing with -test_op max, one can see the sign does take effect in MPI_MAX.
329      */
330     PetscCall(PetscSFReduceBegin(sf,MPI_UNSIGNED_CHAR,leafdata,rootdata,mop));
331     PetscCall(PetscSFReduceEnd(sf,MPI_UNSIGNED_CHAR,leafdata,rootdata,mop));
332 
333     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Reduce Leafdata in type of unsigned char\n"));
334     len  = 0; PetscCall(PetscSNPrintf(buf,256,"%4d:",rank)); len += 5;
335     for (i=0; i<nleavesalloc; i++) {PetscCall(PetscSNPrintf(buf+len,256-len,"%5u",leafdata[i])); len += 5;}
336     PetscCall(PetscSynchronizedPrintf(PETSC_COMM_WORLD,"%s\n",buf));
337     PetscCall(PetscSynchronizedFlush(PETSC_COMM_WORLD,PETSC_STDOUT));
338 
339     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Reduce Rootdata in type of unsigned char\n"));
340     len = 0; PetscCall(PetscSNPrintf(buf,256,"%4d:",rank)); len += 5;
341     for (i=0; i<nrootsalloc; i++) {PetscCall(PetscSNPrintf(buf+len,256-len,"%5u",rootdata[i])); len += 5;}
342     PetscCall(PetscSynchronizedPrintf(PETSC_COMM_WORLD,"%s\n",buf));
343     PetscCall(PetscSynchronizedFlush(PETSC_COMM_WORLD,PETSC_STDOUT));
344 
345     PetscCall(PetscFree2(rootdata,leafdata));
346   }
347 
348   if (test_degree) {
349     const PetscInt *degree;
350     PetscCall(PetscSFComputeDegreeBegin(sf,&degree));
351     PetscCall(PetscSFComputeDegreeEnd(sf,&degree));
352     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Root degrees\n"));
353     PetscCall(PetscIntView(nrootsalloc,degree,PETSC_VIEWER_STDOUT_WORLD));
354   }
355 
356   if (test_fetchandop) {
357     /* Cannot use text compare here because token ordering is not deterministic */
358     PetscInt *leafdata,*leafupdate,*rootdata;
359     PetscCall(PetscMalloc3(nleavesalloc,&leafdata,nleavesalloc,&leafupdate,nrootsalloc,&rootdata));
360     for (i=0; i<nleavesalloc; i++) leafdata[i] = -1;
361     for (i=0; i<nleaves; i++) leafdata[i*stride] = 1;
362     for (i=0; i<nrootsalloc; i++) rootdata[i] = -1;
363     for (i=0; i<nroots; i++) rootdata[i*stride] = 0;
364     PetscCall(PetscSFFetchAndOpBegin(sf,MPIU_INT,rootdata,leafdata,leafupdate,mop));
365     PetscCall(PetscSFFetchAndOpEnd(sf,MPIU_INT,rootdata,leafdata,leafupdate,mop));
366     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Rootdata (sum of 1 from each leaf)\n"));
367     PetscCall(PetscIntView(nrootsalloc,rootdata,PETSC_VIEWER_STDOUT_WORLD));
368     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Leafupdate (value at roots prior to my atomic update)\n"));
369     PetscCall(PetscIntView(nleavesalloc,leafupdate,PETSC_VIEWER_STDOUT_WORLD));
370     PetscCall(PetscFree3(leafdata,leafupdate,rootdata));
371   }
372 
373   if (test_gather) {
374     const PetscInt *degree;
375     PetscInt       inedges,*indata,*outdata;
376     PetscCall(PetscSFComputeDegreeBegin(sf,&degree));
377     PetscCall(PetscSFComputeDegreeEnd(sf,&degree));
378     for (i=0,inedges=0; i<nrootsalloc; i++) inedges += degree[i];
379     PetscCall(PetscMalloc2(inedges,&indata,nleavesalloc,&outdata));
380     for (i=0; i<nleavesalloc; i++) outdata[i] = -1;
381     for (i=0; i<nleaves; i++) outdata[i*stride] = 1000*(rank+1) + i;
382     PetscCall(PetscSFGatherBegin(sf,MPIU_INT,outdata,indata));
383     PetscCall(PetscSFGatherEnd(sf,MPIU_INT,outdata,indata));
384     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Gathered data at multi-roots from leaves\n"));
385     PetscCall(PetscIntView(inedges,indata,PETSC_VIEWER_STDOUT_WORLD));
386     PetscCall(PetscFree2(indata,outdata));
387   }
388 
389   if (test_scatter) {
390     const PetscInt *degree;
391     PetscInt       j,count,inedges,*indata,*outdata;
392     PetscCall(PetscSFComputeDegreeBegin(sf,&degree));
393     PetscCall(PetscSFComputeDegreeEnd(sf,&degree));
394     for (i=0,inedges=0; i<nrootsalloc; i++) inedges += degree[i];
395     PetscCall(PetscMalloc2(inedges,&indata,nleavesalloc,&outdata));
396     for (i=0; i<nleavesalloc; i++) outdata[i] = -1;
397     for (i=0,count=0; i<nrootsalloc; i++) {
398       for (j=0; j<degree[i]; j++) indata[count++] = 1000*(rank+1) + 100*i + j;
399     }
400     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Data at multi-roots, to scatter to leaves\n"));
401     PetscCall(PetscIntView(inedges,indata,PETSC_VIEWER_STDOUT_WORLD));
402 
403     PetscCall(PetscSFScatterBegin(sf,MPIU_INT,indata,outdata));
404     PetscCall(PetscSFScatterEnd(sf,MPIU_INT,indata,outdata));
405     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Scattered data at leaves\n"));
406     PetscCall(PetscIntView(nleavesalloc,outdata,PETSC_VIEWER_STDOUT_WORLD));
407     PetscCall(PetscFree2(indata,outdata));
408   }
409 
410   if (test_embed) {
411     const PetscInt nroots = 1 + (PetscInt) (rank == 0);
412     PetscInt       selected[2];
413     PetscSF        esf;
414 
415     selected[0] = stride;
416     selected[1] = 2*stride;
417     PetscCall(PetscSFCreateEmbeddedRootSF(sf,nroots,selected,&esf));
418     PetscCall(PetscSFSetUp(esf));
419     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Embedded PetscSF\n"));
420     PetscCall(PetscViewerPushFormat(PETSC_VIEWER_STDOUT_WORLD,PETSC_VIEWER_ASCII_INFO_DETAIL));
421     PetscCall(PetscSFView(esf,PETSC_VIEWER_STDOUT_WORLD));
422     PetscCall(PetscViewerPopFormat(PETSC_VIEWER_STDOUT_WORLD));
423     PetscCall(PetscSFDestroy(&esf));
424   }
425 
426   if (test_invert) {
427     const PetscInt *degree;
428     PetscInt *mRootsOrigNumbering;
429     PetscInt inedges;
430     PetscSF msf,imsf;
431 
432     PetscCall(PetscSFGetMultiSF(sf,&msf));
433     PetscCall(PetscSFCreateInverseSF(msf,&imsf));
434     PetscCall(PetscSFSetUp(msf));
435     PetscCall(PetscSFSetUp(imsf));
436     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Multi-SF\n"));
437     PetscCall(PetscSFView(msf,PETSC_VIEWER_STDOUT_WORLD));
438     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Multi-SF roots indices in original SF roots numbering\n"));
439     PetscCall(PetscSFComputeDegreeBegin(sf,&degree));
440     PetscCall(PetscSFComputeDegreeEnd(sf,&degree));
441     PetscCall(PetscSFComputeMultiRootOriginalNumbering(sf,degree,&inedges,&mRootsOrigNumbering));
442     PetscCall(PetscIntView(inedges,mRootsOrigNumbering,PETSC_VIEWER_STDOUT_WORLD));
443     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Inverse of Multi-SF\n"));
444     PetscCall(PetscSFView(imsf,PETSC_VIEWER_STDOUT_WORLD));
445     PetscCall(PetscViewerASCIIPrintf(PETSC_VIEWER_STDOUT_WORLD,"## Inverse of Multi-SF, original numbering\n"));
446     PetscCall(PetscSFViewCustomLocals_Private(imsf,mRootsOrigNumbering,PETSC_VIEWER_STDOUT_WORLD));
447     PetscCall(PetscSFDestroy(&imsf));
448     PetscCall(PetscFree(mRootsOrigNumbering));
449   }
450 
451   /* Clean storage for star forest. */
452   PetscCall(PetscSFDestroy(&sf));
453   PetscCall(PetscFinalize());
454   return 0;
455 }
456 
457 /*TEST
458 
459    test:
460       nsize: 4
461       filter: grep -v "type" | grep -v "sort"
462       args: -test_bcast -sf_type window -sf_window_sync {{fence active lock}} -sf_window_flavor {{create dynamic allocate}}
463       requires: defined(PETSC_HAVE_MPI_ONE_SIDED) defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
464 
465    test:
466       suffix: 2
467       nsize: 4
468       filter: grep -v "type" | grep -v "sort"
469       args: -test_reduce -sf_type window -sf_window_sync {{fence active lock}} -sf_window_flavor {{create dynamic allocate}}
470       requires: defined(PETSC_HAVE_MPI_ONE_SIDED) defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
471 
472    test:
473       suffix: 2_basic
474       nsize: 4
475       args: -test_reduce -sf_type basic
476 
477    test:
478       suffix: 3
479       nsize: 4
480       filter: grep -v "type" | grep -v "sort"
481       args: -test_degree -sf_type window -sf_window_sync {{fence active lock}} -sf_window_flavor {{create dynamic allocate}}
482       requires: defined(PETSC_HAVE_MPI_ONE_SIDED) defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
483 
484    test:
485       suffix: 3_basic
486       nsize: 4
487       args: -test_degree -sf_type basic
488 
489    test:
490       suffix: 4
491       nsize: 4
492       filter: grep -v "type" | grep -v "sort"
493       args: -test_gather -sf_type window -sf_window_sync {{fence active lock}} -sf_window_flavor {{create dynamic allocate}}
494       requires: defined(PETSC_HAVE_MPI_ONE_SIDED) defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
495 
496    test:
497       suffix: 4_basic
498       nsize: 4
499       args: -test_gather -sf_type basic
500 
501    test:
502       suffix: 4_stride
503       nsize: 4
504       args: -test_gather -sf_type basic -stride 2
505 
506    test:
507       suffix: 5
508       nsize: 4
509       filter: grep -v "type" | grep -v "sort"
510       args: -test_scatter -sf_type window -sf_window_sync {{fence active lock}} -sf_window_flavor {{create dynamic allocate}}
511       requires: defined(PETSC_HAVE_MPI_ONE_SIDED) defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
512 
513    test:
514       suffix: 5_basic
515       nsize: 4
516       args: -test_scatter -sf_type basic
517 
518    test:
519       suffix: 5_stride
520       nsize: 4
521       args: -test_scatter -sf_type basic -stride 2
522 
523    test:
524       suffix: 6
525       nsize: 4
526       filter: grep -v "type" | grep -v "sort"
527       # No -sf_window_flavor dynamic due to bug https://gitlab.com/petsc/petsc/issues/555
528       args: -test_embed -sf_type window -sf_window_sync {{fence active lock}} -sf_window_flavor {{create allocate}}
529       requires: defined(PETSC_HAVE_MPI_ONE_SIDED) defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
530 
531    test:
532       suffix: 6_basic
533       nsize: 4
534       args: -test_embed -sf_type basic
535 
536    test:
537       suffix: 7
538       nsize: 4
539       filter: grep -v "type" | grep -v "sort"
540       args: -test_invert -sf_type window -sf_window_sync {{fence active lock}} -sf_window_flavor {{create dynamic allocate}}
541       requires: defined(PETSC_HAVE_MPI_ONE_SIDED) defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
542 
543    test:
544       suffix: 7_basic
545       nsize: 4
546       args: -test_invert -sf_type basic
547 
548    test:
549       suffix: basic
550       nsize: 4
551       args: -test_bcast -sf_type basic
552       output_file: output/ex1_1_basic.out
553 
554    test:
555       suffix: bcastop_basic
556       nsize: 4
557       args: -test_bcastop -sf_type basic
558       output_file: output/ex1_bcastop_basic.out
559 
560    test:
561       suffix: 8
562       nsize: 3
563       filter: grep -v "type" | grep -v "sort"
564       args: -test_bcast -test_sf_distribute -sf_type window -sf_window_sync {{fence active lock}} -sf_window_flavor {{create dynamic allocate}}
565       requires: defined(PETSC_HAVE_MPI_ONE_SIDED) defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
566 
567    test:
568       suffix: 8_basic
569       nsize: 3
570       args: -test_bcast -test_sf_distribute -sf_type basic
571 
572    test:
573       suffix: 9_char
574       nsize: 4
575       args: -sf_type basic -test_bcast -test_reduce -test_op max -test_char
576 
577    # Here we do not test -sf_window_flavor dynamic since it is designed for repeated SFs with few different rootdata pointers
578    test:
579       suffix: 10
580       filter: grep -v "type" | grep -v "sort"
581       nsize: 4
582       args: -sf_type window -sf_window_sync {{fence active lock}} -sf_window_flavor {{create allocate}} -test_all -test_bcastop 0 -test_fetchandop 0
583       requires: defined(PETSC_HAVE_MPI_ONE_SIDED) defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
584 
585    # The nightly test suite with MPICH uses ch3:sock, which is broken when winsize == 0 in some of the processes
586    test:
587       suffix: 10_shared
588       output_file: output/ex1_10.out
589       filter: grep -v "type" | grep -v "sort"
590       nsize: 4
591       args: -sf_type window -sf_window_sync {{fence active lock}} -sf_window_flavor shared -test_all -test_bcastop 0 -test_fetchandop 0
592       requires: defined(PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY) !defined(PETSC_HAVE_MPICH_NUMVERSION) defined(PETSC_HAVE_MPI_ONE_SIDED) defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
593 
594    test:
595       suffix: 10_basic
596       nsize: 4
597       args: -sf_type basic -test_all -test_bcastop 0 -test_fetchandop 0
598 
599 TEST*/
600