xref: /petsc/src/vec/vec/utils/comb.c (revision ae1ee55146a7ad071171b861759b85940c7e5c67)
1 /*
2       Split phase global vector reductions with support for combining the
3    communication portion of several operations. Using MPI-1.1 support only
4 
5       The idea for this and much of the initial code is contributed by
6    Victor Eijkhout.
7 
8        Usage:
9              VecDotBegin(Vec,Vec,PetscScalar *);
10              VecNormBegin(Vec,NormType,PetscReal *);
11              ....
12              VecDotEnd(Vec,Vec,PetscScalar *);
13              VecNormEnd(Vec,NormType,PetscReal *);
14 
15        Limitations:
16          - The order of the xxxEnd() functions MUST be in the same order
17            as the xxxBegin(). There is extensive error checking to try to
18            insure that the user calls the routines in the correct order
19 */
20 
21 #include <petsc/private/vecimpl.h> /*I   "petscvec.h"    I*/
22 
MPIU_Iallreduce(void * sendbuf,void * recvbuf,PetscMPIInt count,MPI_Datatype datatype,MPI_Op op,MPI_Comm comm,MPI_Request * request)23 static PetscMPIInt MPIU_Iallreduce(void *sendbuf, void *recvbuf, PetscMPIInt count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Request *request)
24 {
25   PetscMPIInt err;
26 
27 #if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
28   err = MPI_Iallreduce(sendbuf, recvbuf, count, datatype, op, comm, request);
29 #else
30   err      = MPIU_Allreduce(sendbuf, recvbuf, count, datatype, op, comm);
31   *request = MPI_REQUEST_NULL;
32 #endif
33   return err;
34 }
35 
36 static PetscErrorCode PetscSplitReductionApply(PetscSplitReduction *);
37 
38 /*
39    PetscSplitReductionCreate - Creates a data structure to contain the queued information.
40 */
PetscSplitReductionCreate(MPI_Comm comm,PetscSplitReduction ** sr)41 static PetscErrorCode PetscSplitReductionCreate(MPI_Comm comm, PetscSplitReduction **sr)
42 {
43   PetscFunctionBegin;
44   PetscCall(PetscNew(sr));
45   (*sr)->numopsbegin = 0;
46   (*sr)->numopsend   = 0;
47   (*sr)->state       = STATE_BEGIN;
48 #define MAXOPS 32
49   (*sr)->maxops = MAXOPS;
50   PetscCall(PetscMalloc6(MAXOPS, &(*sr)->lvalues, MAXOPS, &(*sr)->gvalues, MAXOPS, &(*sr)->invecs, MAXOPS, &(*sr)->reducetype, MAXOPS, &(*sr)->lvalues_mix, MAXOPS, &(*sr)->gvalues_mix));
51 #undef MAXOPS
52   (*sr)->comm    = comm;
53   (*sr)->request = MPI_REQUEST_NULL;
54   (*sr)->mix     = PETSC_FALSE;
55   (*sr)->async   = PetscDefined(HAVE_MPI_NONBLOCKING_COLLECTIVES) ? PETSC_TRUE : PETSC_FALSE; /* Enable by default */
56   /* always check for option; so that tests that run on systems without support don't warn about unhandled options */
57   PetscCall(PetscOptionsGetBool(NULL, NULL, "-splitreduction_async", &(*sr)->async, NULL));
58   PetscFunctionReturn(PETSC_SUCCESS);
59 }
60 
61 /*
62        This function is the MPI reduction operation used when there is
63    a combination of sums and max in the reduction. The call below to
64    MPI_Op_create() converts the function PetscSplitReduction_Local() to the
65    MPI operator PetscSplitReduction_Op.
66 */
67 MPI_Op PetscSplitReduction_Op = 0;
68 
PetscSplitReduction_Local(void * in,void * out,PetscMPIInt * cnt,MPI_Datatype * datatype)69 PETSC_EXTERN void MPIAPI PetscSplitReduction_Local(void *in, void *out, PetscMPIInt *cnt, MPI_Datatype *datatype)
70 {
71   struct PetscScalarInt {
72     PetscScalar v;
73     PetscInt    i;
74   };
75   struct PetscScalarInt *xin  = (struct PetscScalarInt *)in;
76   struct PetscScalarInt *xout = (struct PetscScalarInt *)out;
77   PetscInt               i, count = *cnt;
78 
79   PetscFunctionBegin;
80   if (*datatype != MPIU_SCALAR_INT) {
81     PetscCallAbort(MPI_COMM_SELF, (*PetscErrorPrintf)("Can only handle MPIU_SCALAR_INT data types"));
82     PETSCABORT(MPI_COMM_SELF, PETSC_ERR_ARG_WRONG);
83   }
84   for (i = 0; i < count; i++) {
85     if (xin[i].i == PETSC_SR_REDUCE_SUM) xout[i].v += xin[i].v;
86     else if (xin[i].i == PETSC_SR_REDUCE_MAX) xout[i].v = PetscMax(PetscRealPart(xout[i].v), PetscRealPart(xin[i].v));
87     else if (xin[i].i == PETSC_SR_REDUCE_MIN) xout[i].v = PetscMin(PetscRealPart(xout[i].v), PetscRealPart(xin[i].v));
88     else {
89       PetscCallAbort(MPI_COMM_SELF, (*PetscErrorPrintf)("Reduction type input is not PETSC_SR_REDUCE_SUM, PETSC_SR_REDUCE_MAX, or PETSC_SR_REDUCE_MIN"));
90       PETSCABORT(MPI_COMM_SELF, PETSC_ERR_ARG_WRONG);
91     }
92   }
93   PetscFunctionReturnVoid();
94 }
95 
96 /*@
97   PetscCommSplitReductionBegin - Begin an asynchronous split-mode reduction
98 
99   Collective but not synchronizing
100 
101   Input Parameter:
102 . comm - communicator on which split reduction has been queued
103 
104   Level: advanced
105 
106   Note:
107   Calling this function is optional when using split-mode reduction. On supporting hardware,
108   calling this after all VecXxxBegin() allows the reduction to make asynchronous progress
109   before the result is needed (in VecXxxEnd()).
110 
111 .seealso: `VecNormBegin()`, `VecNormEnd()`, `VecDotBegin()`, `VecDotEnd()`, `VecTDotBegin()`, `VecTDotEnd()`, `VecMDotBegin()`, `VecMDotEnd()`, `VecMTDotBegin()`, `VecMTDotEnd()`
112 @*/
PetscCommSplitReductionBegin(MPI_Comm comm)113 PetscErrorCode PetscCommSplitReductionBegin(MPI_Comm comm)
114 {
115   PetscSplitReduction *sr;
116 
117   PetscFunctionBegin;
118   if (PetscDefined(HAVE_THREADSAFETY)) PetscFunctionReturn(PETSC_SUCCESS);
119   PetscCall(PetscSplitReductionGet(comm, &sr));
120   PetscCheck(sr->numopsend <= 0, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Cannot call this after VecxxxEnd() has been called");
121   if (sr->async) { /* Bad reuse, setup code copied from PetscSplitReductionApply(). */
122     PetscMPIInt           numops     = sr->numopsbegin;
123     PetscSRReductionType *reducetype = sr->reducetype;
124     PetscScalar          *lvalues = sr->lvalues, *gvalues = sr->gvalues;
125     PetscInt              sum_flg = 0, max_flg = 0, min_flg = 0;
126     MPI_Comm              comm = sr->comm;
127     PetscMPIInt           size, cmul = sizeof(PetscScalar) / sizeof(PetscReal);
128 
129     PetscCall(PetscLogEventBegin(VEC_ReduceBegin, 0, 0, 0, 0));
130     PetscCallMPI(MPI_Comm_size(sr->comm, &size));
131     if (size == 1) {
132       PetscCall(PetscArraycpy(gvalues, lvalues, numops));
133     } else {
134       /* determine if all reductions are sum, max, or min */
135       for (PetscMPIInt i = 0; i < numops; i++) {
136         if (reducetype[i] == PETSC_SR_REDUCE_MAX) max_flg = 1;
137         else if (reducetype[i] == PETSC_SR_REDUCE_SUM) sum_flg = 1;
138         else if (reducetype[i] == PETSC_SR_REDUCE_MIN) min_flg = 1;
139         else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Error in PetscSplitReduction() data structure, probably memory corruption");
140       }
141       PetscCheck(sum_flg + max_flg + min_flg <= 1 || !sr->mix, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Error in PetscSplitReduction() data structure, probably memory corruption");
142       if (sum_flg + max_flg + min_flg > 1) {
143         sr->mix = PETSC_TRUE;
144         for (PetscMPIInt i = 0; i < numops; i++) {
145           sr->lvalues_mix[i].v = lvalues[i];
146           sr->lvalues_mix[i].i = reducetype[i];
147         }
148         PetscCallMPI(MPIU_Iallreduce(sr->lvalues_mix, sr->gvalues_mix, numops, MPIU_SCALAR_INT, PetscSplitReduction_Op, comm, &sr->request));
149       } else if (max_flg) { /* Compute max of real and imag parts separately, presumably only the real part is used */
150         PetscCallMPI(MPIU_Iallreduce(lvalues, gvalues, cmul * numops, MPIU_REAL, MPIU_MAX, comm, &sr->request));
151       } else if (min_flg) {
152         PetscCallMPI(MPIU_Iallreduce(lvalues, gvalues, cmul * numops, MPIU_REAL, MPIU_MIN, comm, &sr->request));
153       } else {
154         PetscCallMPI(MPIU_Iallreduce(lvalues, gvalues, numops, MPIU_SCALAR, MPIU_SUM, comm, &sr->request));
155       }
156     }
157     sr->state     = STATE_PENDING;
158     sr->numopsend = 0;
159     PetscCall(PetscLogEventEnd(VEC_ReduceBegin, 0, 0, 0, 0));
160   } else {
161     PetscCall(PetscSplitReductionApply(sr));
162   }
163   PetscFunctionReturn(PETSC_SUCCESS);
164 }
165 
PetscSplitReductionEnd(PetscSplitReduction * sr)166 PetscErrorCode PetscSplitReductionEnd(PetscSplitReduction *sr)
167 {
168   PetscFunctionBegin;
169   switch (sr->state) {
170   case STATE_BEGIN: /* We are doing synchronous communication and this is the first call to VecXxxEnd() so do the communication */
171     PetscCall(PetscSplitReductionApply(sr));
172     break;
173   case STATE_PENDING:
174     /* We are doing asynchronous-mode communication and this is the first VecXxxEnd() so wait for comm to complete */
175     PetscCall(PetscLogEventBegin(VEC_ReduceEnd, 0, 0, 0, 0));
176     if (sr->request != MPI_REQUEST_NULL) PetscCallMPI(MPI_Wait(&sr->request, MPI_STATUS_IGNORE));
177     sr->state = STATE_END;
178     if (sr->mix) {
179       for (PetscMPIInt i = 0; i < sr->numopsbegin; i++) sr->gvalues[i] = sr->gvalues_mix[i].v;
180       sr->mix = PETSC_FALSE;
181     }
182     PetscCall(PetscLogEventEnd(VEC_ReduceEnd, 0, 0, 0, 0));
183     break;
184   default:
185     break; /* everything is already done */
186   }
187   PetscFunctionReturn(PETSC_SUCCESS);
188 }
189 
190 /*
191    PetscSplitReductionApply - Actually do the communication required for a split phase reduction
192 */
PetscSplitReductionApply(PetscSplitReduction * sr)193 static PetscErrorCode PetscSplitReductionApply(PetscSplitReduction *sr)
194 {
195   PetscMPIInt           numops     = sr->numopsbegin;
196   PetscSRReductionType *reducetype = sr->reducetype;
197   PetscScalar          *lvalues = sr->lvalues, *gvalues = sr->gvalues;
198   PetscInt              sum_flg = 0, max_flg = 0, min_flg = 0;
199   MPI_Comm              comm = sr->comm;
200   PetscMPIInt           size, cmul = sizeof(PetscScalar) / sizeof(PetscReal);
201 
202   PetscFunctionBegin;
203   PetscCheck(sr->numopsend <= 0, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Cannot call this after VecxxxEnd() has been called");
204   PetscCall(PetscLogEventBegin(VEC_ReduceCommunication, 0, 0, 0, 0));
205   PetscCallMPI(MPI_Comm_size(sr->comm, &size));
206   if (size == 1) {
207     PetscCall(PetscArraycpy(gvalues, lvalues, numops));
208   } else {
209     /* determine if all reductions are sum, max, or min */
210     for (PetscMPIInt i = 0; i < numops; i++) {
211       if (reducetype[i] == PETSC_SR_REDUCE_MAX) max_flg = 1;
212       else if (reducetype[i] == PETSC_SR_REDUCE_SUM) sum_flg = 1;
213       else if (reducetype[i] == PETSC_SR_REDUCE_MIN) min_flg = 1;
214       else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Error in PetscSplitReduction() data structure, probably memory corruption");
215     }
216     if (sum_flg + max_flg + min_flg > 1) {
217       PetscCheck(!sr->mix, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Error in PetscSplitReduction() data structure, probably memory corruption");
218       for (PetscMPIInt i = 0; i < numops; i++) {
219         sr->lvalues_mix[i].v = lvalues[i];
220         sr->lvalues_mix[i].i = reducetype[i];
221       }
222       PetscCallMPI(MPIU_Allreduce(sr->lvalues_mix, sr->gvalues_mix, numops, MPIU_SCALAR_INT, PetscSplitReduction_Op, comm));
223       for (PetscMPIInt i = 0; i < numops; i++) sr->gvalues[i] = sr->gvalues_mix[i].v;
224     } else if (max_flg) { /* Compute max of real and imag parts separately, presumably only the real part is used */
225       PetscCallMPI(MPIU_Allreduce(lvalues, gvalues, cmul * numops, MPIU_REAL, MPIU_MAX, comm));
226     } else if (min_flg) {
227       PetscCallMPI(MPIU_Allreduce(lvalues, gvalues, cmul * numops, MPIU_REAL, MPIU_MIN, comm));
228     } else {
229       PetscCallMPI(MPIU_Allreduce(lvalues, gvalues, numops, MPIU_SCALAR, MPIU_SUM, comm));
230     }
231   }
232   sr->state     = STATE_END;
233   sr->numopsend = 0;
234   PetscCall(PetscLogEventEnd(VEC_ReduceCommunication, 0, 0, 0, 0));
235   PetscFunctionReturn(PETSC_SUCCESS);
236 }
237 
238 /*
239    PetscSplitReductionExtend - Double the amount of space (slots) allocated for a split reduction object.
240 */
PetscSplitReductionExtend(PetscSplitReduction * sr)241 PetscErrorCode PetscSplitReductionExtend(PetscSplitReduction *sr)
242 {
243   struct PetscScalarInt {
244     PetscScalar v;
245     PetscInt    i;
246   };
247   PetscMPIInt            maxops     = sr->maxops;
248   PetscSRReductionType  *reducetype = sr->reducetype;
249   PetscScalar           *lvalues = sr->lvalues, *gvalues = sr->gvalues;
250   struct PetscScalarInt *lvalues_mix = (struct PetscScalarInt *)sr->lvalues_mix;
251   struct PetscScalarInt *gvalues_mix = (struct PetscScalarInt *)sr->gvalues_mix;
252   void                 **invecs      = sr->invecs;
253 
254   PetscFunctionBegin;
255   sr->maxops = 2 * maxops;
256   PetscCall(PetscMalloc6(2 * maxops, &sr->lvalues, 2 * maxops, &sr->gvalues, 2 * maxops, &sr->reducetype, 2 * maxops, &sr->invecs, 2 * maxops, &sr->lvalues_mix, 2 * maxops, &sr->gvalues_mix));
257   PetscCall(PetscArraycpy(sr->lvalues, lvalues, maxops));
258   PetscCall(PetscArraycpy(sr->gvalues, gvalues, maxops));
259   PetscCall(PetscArraycpy(sr->reducetype, reducetype, maxops));
260   PetscCall(PetscArraycpy(sr->invecs, invecs, maxops));
261   PetscCall(PetscArraycpy(sr->lvalues_mix, lvalues_mix, maxops));
262   PetscCall(PetscArraycpy(sr->gvalues_mix, gvalues_mix, maxops));
263   PetscCall(PetscFree6(lvalues, gvalues, reducetype, invecs, lvalues_mix, gvalues_mix));
264   PetscFunctionReturn(PETSC_SUCCESS);
265 }
266 
PetscSplitReductionDestroy(PetscSplitReduction * sr)267 static PetscErrorCode PetscSplitReductionDestroy(PetscSplitReduction *sr)
268 {
269   PetscFunctionBegin;
270   PetscCall(PetscFree6(sr->lvalues, sr->gvalues, sr->reducetype, sr->invecs, sr->lvalues_mix, sr->gvalues_mix));
271   PetscCall(PetscFree(sr));
272   PetscFunctionReturn(PETSC_SUCCESS);
273 }
274 
275 PetscMPIInt Petsc_Reduction_keyval = MPI_KEYVAL_INVALID;
276 
277 /*
278    Private routine to delete internal storage when a communicator is freed.
279   This is called by MPI, not by users.
280 
281   The binding for the first argument changed from MPI 1.0 to 1.1; in 1.0
282   it was MPI_Comm *comm.
283 */
Petsc_DelReduction(MPI_Comm comm,PETSC_UNUSED PetscMPIInt keyval,void * attr_val,PETSC_UNUSED void * extra_state)284 static PetscMPIInt MPIAPI Petsc_DelReduction(MPI_Comm comm, PETSC_UNUSED PetscMPIInt keyval, void *attr_val, PETSC_UNUSED void *extra_state)
285 {
286   PetscFunctionBegin;
287   PetscCallReturnMPI(PetscInfo(0, "Deleting reduction data in an MPI_Comm %ld\n", (long)comm));
288   PetscCallReturnMPI(PetscSplitReductionDestroy((PetscSplitReduction *)attr_val));
289   PetscFunctionReturn(PETSC_SUCCESS);
290 }
291 
292 /*
293      PetscSplitReductionGet - Gets the split reduction object from a
294         PETSc vector, creates if it does not exit.
295 
296 */
PetscSplitReductionGet(MPI_Comm comm,PetscSplitReduction ** sr)297 PetscErrorCode PetscSplitReductionGet(MPI_Comm comm, PetscSplitReduction **sr)
298 {
299   PetscMPIInt iflg;
300 
301   PetscFunctionBegin;
302   PetscCheck(!PetscDefined(HAVE_THREADSAFETY), comm, PETSC_ERR_SUP, "PetscSplitReductionGet() is not thread-safe");
303   if (Petsc_Reduction_keyval == MPI_KEYVAL_INVALID) {
304     /*
305        The calling sequence of the 2nd argument to this function changed
306        between MPI Standard 1.0 and the revisions 1.1 Here we match the
307        new standard, if you are using an MPI implementation that uses
308        the older version you will get a warning message about the next line;
309        it is only a warning message and should do no harm.
310     */
311     PetscCallMPI(MPI_Comm_create_keyval(MPI_COMM_NULL_COPY_FN, Petsc_DelReduction, &Petsc_Reduction_keyval, NULL));
312   }
313   PetscCallMPI(MPI_Comm_get_attr(comm, Petsc_Reduction_keyval, (void **)sr, &iflg));
314   if (!iflg) { /* doesn't exist yet so create it and put it in */
315     PetscCall(PetscSplitReductionCreate(comm, sr));
316     PetscCallMPI(MPI_Comm_set_attr(comm, Petsc_Reduction_keyval, *sr));
317     PetscCall(PetscInfo(0, "Putting reduction data in an MPI_Comm %ld\n", (long)comm));
318   }
319   PetscFunctionReturn(PETSC_SUCCESS);
320 }
321 
322 /*@
323   VecDotBegin - Starts a split phase dot product computation.
324 
325   Input Parameters:
326 + x      - the first vector
327 . y      - the second vector
328 - result - where the result will go (can be `NULL`)
329 
330   Level: advanced
331 
332   Notes:
333   Each call to `VecDotBegin()` should be paired with a call to `VecDotEnd()`.
334 
335 .seealso: `VecDotEnd()`, `VecNormBegin()`, `VecNormEnd()`, `VecNorm()`, `VecDot()`, `VecMDot()`,
336           `VecTDotBegin()`, `VecTDotEnd()`, `PetscCommSplitReductionBegin()`
337 @*/
VecDotBegin(Vec x,Vec y,PetscScalar * result)338 PetscErrorCode VecDotBegin(Vec x, Vec y, PetscScalar *result)
339 {
340   PetscSplitReduction *sr;
341   MPI_Comm             comm;
342 
343   PetscFunctionBegin;
344   PetscValidHeaderSpecific(x, VEC_CLASSID, 1);
345   PetscValidHeaderSpecific(y, VEC_CLASSID, 2);
346   PetscCall(PetscObjectGetComm((PetscObject)x, &comm));
347   if (PetscDefined(HAVE_THREADSAFETY)) {
348     PetscCheck(result, comm, PETSC_ERR_ARG_NULL, "result cannot be NULL when configuring --with-threadsafety");
349     PetscCall(VecDot(x, y, result));
350     PetscFunctionReturn(PETSC_SUCCESS);
351   }
352   PetscCall(PetscSplitReductionGet(comm, &sr));
353   PetscCheck(sr->state == STATE_BEGIN, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Called before all VecxxxEnd() called");
354   if (sr->numopsbegin >= sr->maxops) PetscCall(PetscSplitReductionExtend(sr));
355   sr->reducetype[sr->numopsbegin] = PETSC_SR_REDUCE_SUM;
356   sr->invecs[sr->numopsbegin]     = (void *)x;
357   PetscCall(PetscLogEventBegin(VEC_ReduceArithmetic, 0, 0, 0, 0));
358   PetscUseTypeMethod(x, dot_local, y, sr->lvalues + sr->numopsbegin++);
359   PetscCall(PetscLogEventEnd(VEC_ReduceArithmetic, 0, 0, 0, 0));
360   PetscFunctionReturn(PETSC_SUCCESS);
361 }
362 
363 /*@
364   VecDotEnd - Ends a split phase dot product computation.
365 
366   Input Parameters:
367 + x      - the first vector (can be `NULL`)
368 . y      - the second vector (can be `NULL`)
369 - result - where the result will go
370 
371   Level: advanced
372 
373   Notes:
374   Each call to `VecDotBegin()` should be paired with a call to `VecDotEnd()`.
375 
376 .seealso: `VecDotBegin()`, `VecNormBegin()`, `VecNormEnd()`, `VecNorm()`, `VecDot()`, `VecMDot()`,
377           `VecTDotBegin()`, `VecTDotEnd()`, `PetscCommSplitReductionBegin()`
378 @*/
VecDotEnd(Vec x,Vec y,PetscScalar * result)379 PetscErrorCode VecDotEnd(Vec x, Vec y, PetscScalar *result)
380 {
381   PetscSplitReduction *sr;
382   MPI_Comm             comm;
383 
384   PetscFunctionBegin;
385   if (PetscDefined(HAVE_THREADSAFETY)) PetscFunctionReturn(PETSC_SUCCESS);
386   PetscCall(PetscObjectGetComm((PetscObject)x, &comm));
387   PetscCall(PetscSplitReductionGet(comm, &sr));
388   PetscCall(PetscSplitReductionEnd(sr));
389 
390   PetscCheck(sr->numopsend < sr->numopsbegin, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Called VecxxxEnd() more times then VecxxxBegin()");
391   PetscCheck(!x || (void *)x == sr->invecs[sr->numopsend], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Called VecxxxEnd() in a different order or with a different vector than VecxxxBegin()");
392   PetscCheck(sr->reducetype[sr->numopsend] == PETSC_SR_REDUCE_SUM, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Called VecDotEnd() on a reduction started with VecNormBegin()");
393   *result = sr->gvalues[sr->numopsend++];
394 
395   /*
396      We are finished getting all the results so reset to no outstanding requests
397   */
398   if (sr->numopsend == sr->numopsbegin) {
399     sr->state       = STATE_BEGIN;
400     sr->numopsend   = 0;
401     sr->numopsbegin = 0;
402     sr->mix         = PETSC_FALSE;
403   }
404   PetscFunctionReturn(PETSC_SUCCESS);
405 }
406 
407 /*@
408   VecTDotBegin - Starts a split phase transpose dot product computation.
409 
410   Input Parameters:
411 + x      - the first vector
412 . y      - the second vector
413 - result - where the result will go (can be `NULL`)
414 
415   Level: advanced
416 
417   Notes:
418   Each call to `VecTDotBegin()` should be paired with a call to `VecTDotEnd()`.
419 
420 .seealso: `VecTDotEnd()`, `VecNormBegin()`, `VecNormEnd()`, `VecNorm()`, `VecDot()`, `VecMDot()`,
421           `VecDotBegin()`, `VecDotEnd()`, `PetscCommSplitReductionBegin()`
422 @*/
VecTDotBegin(Vec x,Vec y,PetscScalar * result)423 PetscErrorCode VecTDotBegin(Vec x, Vec y, PetscScalar *result)
424 {
425   PetscSplitReduction *sr;
426   MPI_Comm             comm;
427 
428   PetscFunctionBegin;
429   PetscCall(PetscObjectGetComm((PetscObject)x, &comm));
430   if (PetscDefined(HAVE_THREADSAFETY)) {
431     PetscCheck(result, comm, PETSC_ERR_ARG_NULL, "result cannot be NULL when configuring --with-threadsafety");
432     PetscCall(VecTDot(x, y, result));
433     PetscFunctionReturn(PETSC_SUCCESS);
434   }
435   PetscCall(PetscSplitReductionGet(comm, &sr));
436   PetscCheck(sr->state == STATE_BEGIN, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Called before all VecxxxEnd() called");
437   if (sr->numopsbegin >= sr->maxops) PetscCall(PetscSplitReductionExtend(sr));
438   sr->reducetype[sr->numopsbegin] = PETSC_SR_REDUCE_SUM;
439   sr->invecs[sr->numopsbegin]     = (void *)x;
440   PetscCall(PetscLogEventBegin(VEC_ReduceArithmetic, 0, 0, 0, 0));
441   PetscUseTypeMethod(x, tdot_local, y, sr->lvalues + sr->numopsbegin++);
442   PetscCall(PetscLogEventEnd(VEC_ReduceArithmetic, 0, 0, 0, 0));
443   PetscFunctionReturn(PETSC_SUCCESS);
444 }
445 
446 /*@
447   VecTDotEnd - Ends a split phase transpose dot product computation.
448 
449   Input Parameters:
450 + x      - the first vector (can be `NULL`)
451 . y      - the second vector (can be `NULL`)
452 - result - where the result will go
453 
454   Level: advanced
455 
456   Notes:
457   Each call to `VecTDotBegin()` should be paired with a call to `VecTDotEnd()`.
458 
459 .seealso: `VecTDotBegin()`, `VecNormBegin()`, `VecNormEnd()`, `VecNorm()`, `VecDot()`, `VecMDot()`,
460           `VecDotBegin()`, `VecDotEnd()`
461 @*/
VecTDotEnd(Vec x,Vec y,PetscScalar * result)462 PetscErrorCode VecTDotEnd(Vec x, Vec y, PetscScalar *result)
463 {
464   PetscFunctionBegin;
465   /*
466       TDotEnd() is the same as DotEnd() so reuse the code
467   */
468   PetscCall(VecDotEnd(x, y, result));
469   PetscFunctionReturn(PETSC_SUCCESS);
470 }
471 
472 /*@
473   VecNormBegin - Starts a split phase norm computation.
474 
475   Input Parameters:
476 + x      - the first vector
477 . ntype  - norm type, one of `NORM_1`, `NORM_2`, `NORM_MAX`, `NORM_1_AND_2`
478 - result - where the result will go (can be `NULL`)
479 
480   Level: advanced
481 
482   Notes:
483   Each call to `VecNormBegin()` should be paired with a call to `VecNormEnd()`.
484 
485 .seealso: `VecNormEnd()`, `VecNorm()`, `VecDot()`, `VecMDot()`, `VecDotBegin()`, `VecDotEnd()`, `PetscCommSplitReductionBegin()`
486 @*/
VecNormBegin(Vec x,NormType ntype,PetscReal * result)487 PetscErrorCode VecNormBegin(Vec x, NormType ntype, PetscReal *result)
488 {
489   PetscSplitReduction *sr;
490   PetscReal            lresult[2];
491   MPI_Comm             comm;
492 
493   PetscFunctionBegin;
494   PetscValidHeaderSpecific(x, VEC_CLASSID, 1);
495   PetscCall(PetscObjectGetComm((PetscObject)x, &comm));
496   if (PetscDefined(HAVE_THREADSAFETY)) {
497     PetscCheck(result, comm, PETSC_ERR_ARG_NULL, "result cannot be NULL when configuring --with-threadsafety");
498     PetscCall(PetscObjectStateIncrease((PetscObject)x)); // increase PetscObjectState to invalidate cached norms
499     PetscCall(VecNorm(x, ntype, result));
500     PetscFunctionReturn(PETSC_SUCCESS);
501   }
502   PetscCall(PetscSplitReductionGet(comm, &sr));
503   PetscCheck(sr->state == STATE_BEGIN, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Called before all VecxxxEnd() called");
504   if (sr->numopsbegin >= sr->maxops || (sr->numopsbegin == sr->maxops - 1 && ntype == NORM_1_AND_2)) PetscCall(PetscSplitReductionExtend(sr));
505 
506   sr->invecs[sr->numopsbegin] = (void *)x;
507   PetscCall(PetscLogEventBegin(VEC_ReduceArithmetic, 0, 0, 0, 0));
508   PetscUseTypeMethod(x, norm_local, ntype, lresult);
509   PetscCall(PetscLogEventEnd(VEC_ReduceArithmetic, 0, 0, 0, 0));
510   if (ntype == NORM_2) lresult[0] = lresult[0] * lresult[0];
511   if (ntype == NORM_1_AND_2) lresult[1] = lresult[1] * lresult[1];
512   if (ntype == NORM_MAX) sr->reducetype[sr->numopsbegin] = PETSC_SR_REDUCE_MAX;
513   else sr->reducetype[sr->numopsbegin] = PETSC_SR_REDUCE_SUM;
514   sr->lvalues[sr->numopsbegin++] = lresult[0];
515   if (ntype == NORM_1_AND_2) {
516     sr->reducetype[sr->numopsbegin] = PETSC_SR_REDUCE_SUM;
517     sr->lvalues[sr->numopsbegin++]  = lresult[1];
518   }
519   PetscFunctionReturn(PETSC_SUCCESS);
520 }
521 
522 /*@
523   VecNormEnd - Ends a split phase norm computation.
524 
525   Input Parameters:
526 + x      - the first vector
527 . ntype  - norm type, one of `NORM_1`, `NORM_2`, `NORM_MAX`, `NORM_1_AND_2`
528 - result - where the result will go
529 
530   Level: advanced
531 
532   Notes:
533   Each call to `VecNormBegin()` should be paired with a call to `VecNormEnd()`.
534 
535   The `x` vector is not allowed to be `NULL`, otherwise the vector would not have its correctly cached norm value
536 
537 .seealso: `VecNormBegin()`, `VecNorm()`, `VecDot()`, `VecMDot()`, `VecDotBegin()`, `VecDotEnd()`, `PetscCommSplitReductionBegin()`
538 @*/
VecNormEnd(Vec x,NormType ntype,PetscReal * result)539 PetscErrorCode VecNormEnd(Vec x, NormType ntype, PetscReal *result)
540 {
541   PetscSplitReduction *sr;
542   MPI_Comm             comm;
543 
544   PetscFunctionBegin;
545   if (PetscDefined(HAVE_THREADSAFETY)) PetscFunctionReturn(PETSC_SUCCESS);
546   PetscValidHeaderSpecific(x, VEC_CLASSID, 1);
547   PetscCall(PetscObjectGetComm((PetscObject)x, &comm));
548   PetscCall(PetscSplitReductionGet(comm, &sr));
549   PetscCall(PetscSplitReductionEnd(sr));
550 
551   PetscCheck(sr->numopsend < sr->numopsbegin, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Called VecxxxEnd() more times then VecxxxBegin()");
552   PetscCheck((void *)x == sr->invecs[sr->numopsend], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Called VecxxxEnd() in a different order or with a different vector than VecxxxBegin()");
553   PetscCheck(sr->reducetype[sr->numopsend] == PETSC_SR_REDUCE_MAX || ntype != NORM_MAX, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Called VecNormEnd(,NORM_MAX,) on a reduction started with VecDotBegin() or NORM_1 or NORM_2");
554   result[0] = PetscRealPart(sr->gvalues[sr->numopsend++]);
555 
556   if (ntype == NORM_2) result[0] = PetscSqrtReal(result[0]);
557   else if (ntype == NORM_1_AND_2) {
558     result[1] = PetscRealPart(sr->gvalues[sr->numopsend++]);
559     result[1] = PetscSqrtReal(result[1]);
560   }
561   if (ntype != NORM_1_AND_2) PetscCall(PetscObjectComposedDataSetReal((PetscObject)x, NormIds[ntype], result[0]));
562 
563   if (sr->numopsend == sr->numopsbegin) {
564     sr->state       = STATE_BEGIN;
565     sr->numopsend   = 0;
566     sr->numopsbegin = 0;
567   }
568   PetscFunctionReturn(PETSC_SUCCESS);
569 }
570 
571 /*
572    Possibly add
573 
574      PetscReductionSumBegin/End()
575      PetscReductionMaxBegin/End()
576      PetscReductionMinBegin/End()
577    or have more like MPI with a single function with flag for Op? Like first better
578 */
579 
580 /*@
581   VecMDotBegin - Starts a split phase multiple dot product computation.
582 
583   Input Parameters:
584 + x      - the first vector
585 . nv     - number of vectors
586 . y      - array of vectors
587 - result - where the result will go (can be `NULL`)
588 
589   Level: advanced
590 
591   Notes:
592   Each call to `VecMDotBegin()` should be paired with a call to `VecMDotEnd()`.
593 
594 .seealso: `VecMDotEnd()`, `VecNormBegin()`, `VecNormEnd()`, `VecNorm()`, `VecDot()`, `VecMDot()`,
595           `VecTDotBegin()`, `VecTDotEnd()`, `VecMTDotBegin()`, `VecMTDotEnd()`, `PetscCommSplitReductionBegin()`
596 @*/
VecMDotBegin(Vec x,PetscInt nv,const Vec y[],PetscScalar result[])597 PetscErrorCode VecMDotBegin(Vec x, PetscInt nv, const Vec y[], PetscScalar result[])
598 {
599   PetscSplitReduction *sr;
600   MPI_Comm             comm;
601   PetscInt             i;
602 
603   PetscFunctionBegin;
604   PetscCall(PetscObjectGetComm((PetscObject)x, &comm));
605   if (PetscDefined(HAVE_THREADSAFETY)) {
606     PetscCheck(result, comm, PETSC_ERR_ARG_NULL, "result cannot be NULL when configuring --with-threadsafety");
607     PetscCall(VecMDot(x, nv, y, result));
608     PetscFunctionReturn(PETSC_SUCCESS);
609   }
610   PetscCall(PetscSplitReductionGet(comm, &sr));
611   PetscCheck(sr->state == STATE_BEGIN, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Called before all VecxxxEnd() called");
612   for (i = 0; i < nv; i++) {
613     if (sr->numopsbegin + i >= sr->maxops) PetscCall(PetscSplitReductionExtend(sr));
614     sr->reducetype[sr->numopsbegin + i] = PETSC_SR_REDUCE_SUM;
615     sr->invecs[sr->numopsbegin + i]     = (void *)x;
616   }
617   PetscCall(PetscLogEventBegin(VEC_ReduceArithmetic, 0, 0, 0, 0));
618   PetscUseTypeMethod(x, mdot_local, nv, y, sr->lvalues + sr->numopsbegin);
619   PetscCall(PetscLogEventEnd(VEC_ReduceArithmetic, 0, 0, 0, 0));
620   sr->numopsbegin += nv;
621   PetscFunctionReturn(PETSC_SUCCESS);
622 }
623 
624 /*@
625   VecMDotEnd - Ends a split phase multiple dot product computation.
626 
627   Input Parameters:
628 + x  - the first vector (can be `NULL`)
629 . nv - number of vectors
630 - y  - array of vectors (can be `NULL`)
631 
632   Output Parameter:
633 . result - where the result will go
634 
635   Level: advanced
636 
637   Notes:
638   Each call to `VecMDotBegin()` should be paired with a call to `VecMDotEnd()`.
639 
640 .seealso: `VecMDotBegin()`, `VecNormBegin()`, `VecNormEnd()`, `VecNorm()`, `VecDot()`, `VecMDot()`,
641           `VecTDotBegin()`, `VecTDotEnd()`, `VecMTDotBegin()`, `VecMTDotEnd()`, `PetscCommSplitReductionBegin()`
642 @*/
VecMDotEnd(Vec x,PetscInt nv,const Vec y[],PetscScalar result[])643 PetscErrorCode VecMDotEnd(Vec x, PetscInt nv, const Vec y[], PetscScalar result[])
644 {
645   PetscSplitReduction *sr;
646   MPI_Comm             comm;
647   PetscInt             i;
648 
649   PetscFunctionBegin;
650   if (PetscDefined(HAVE_THREADSAFETY)) PetscFunctionReturn(PETSC_SUCCESS);
651   PetscCall(PetscObjectGetComm((PetscObject)x, &comm));
652   PetscCall(PetscSplitReductionGet(comm, &sr));
653   PetscCall(PetscSplitReductionEnd(sr));
654 
655   PetscCheck(sr->numopsend < sr->numopsbegin, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Called VecxxxEnd() more times then VecxxxBegin()");
656   PetscCheck(!x || (void *)x == sr->invecs[sr->numopsend], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Called VecxxxEnd() in a different order or with a different vector than VecxxxBegin()");
657   PetscCheck(sr->reducetype[sr->numopsend] == PETSC_SR_REDUCE_SUM, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Called VecDotEnd() on a reduction started with VecNormBegin()");
658   for (i = 0; i < nv; i++) result[i] = sr->gvalues[sr->numopsend++];
659 
660   /*
661      We are finished getting all the results so reset to no outstanding requests
662   */
663   if (sr->numopsend == sr->numopsbegin) {
664     sr->state       = STATE_BEGIN;
665     sr->numopsend   = 0;
666     sr->numopsbegin = 0;
667   }
668   PetscFunctionReturn(PETSC_SUCCESS);
669 }
670 
671 /*@
672   VecMTDotBegin - Starts a split phase transpose multiple dot product computation.
673 
674   Input Parameters:
675 + x      - the first vector
676 . nv     - number of vectors
677 . y      - array of  vectors
678 - result - where the result will go (can be `NULL`)
679 
680   Level: advanced
681 
682   Notes:
683   Each call to `VecMTDotBegin()` should be paired with a call to `VecMTDotEnd()`.
684 
685 .seealso: `VecMTDotEnd()`, `VecNormBegin()`, `VecNormEnd()`, `VecNorm()`, `VecDot()`, `VecMDot()`,
686           `VecDotBegin()`, `VecDotEnd()`, `VecMDotBegin()`, `VecMDotEnd()`, `PetscCommSplitReductionBegin()`
687 @*/
VecMTDotBegin(Vec x,PetscInt nv,const Vec y[],PetscScalar result[])688 PetscErrorCode VecMTDotBegin(Vec x, PetscInt nv, const Vec y[], PetscScalar result[])
689 {
690   PetscSplitReduction *sr;
691   MPI_Comm             comm;
692   PetscInt             i;
693 
694   PetscFunctionBegin;
695   PetscCall(PetscObjectGetComm((PetscObject)x, &comm));
696   if (PetscDefined(HAVE_THREADSAFETY)) {
697     PetscCheck(result, comm, PETSC_ERR_ARG_NULL, "result cannot be NULL when configuring --with-threadsafety");
698     PetscCall(VecMTDot(x, nv, y, result));
699     PetscFunctionReturn(PETSC_SUCCESS);
700   }
701   PetscCall(PetscSplitReductionGet(comm, &sr));
702   PetscCheck(sr->state == STATE_BEGIN, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Called before all VecxxxEnd() called");
703   for (i = 0; i < nv; i++) {
704     if (sr->numopsbegin + i >= sr->maxops) PetscCall(PetscSplitReductionExtend(sr));
705     sr->reducetype[sr->numopsbegin + i] = PETSC_SR_REDUCE_SUM;
706     sr->invecs[sr->numopsbegin + i]     = (void *)x;
707   }
708   PetscCall(PetscLogEventBegin(VEC_ReduceArithmetic, 0, 0, 0, 0));
709   PetscUseTypeMethod(x, mtdot_local, nv, y, sr->lvalues + sr->numopsbegin);
710   PetscCall(PetscLogEventEnd(VEC_ReduceArithmetic, 0, 0, 0, 0));
711   sr->numopsbegin += nv;
712   PetscFunctionReturn(PETSC_SUCCESS);
713 }
714 
715 /*@
716   VecMTDotEnd - Ends a split phase transpose multiple dot product computation.
717 
718   Input Parameters:
719 + x  - the first vector (can be `NULL`)
720 . nv - number of vectors
721 - y  - array of  vectors (can be `NULL`)
722 
723   Output Parameter:
724 . result - where the result will go
725 
726   Level: advanced
727 
728   Notes:
729   Each call to `VecTDotBegin()` should be paired with a call to `VecTDotEnd()`.
730 
731 .seealso: `VecMTDotBegin()`, `VecNormBegin()`, `VecNormEnd()`, `VecNorm()`, `VecDot()`, `VecMDot()`,
732           `VecDotBegin()`, `VecDotEnd()`, `VecMDotBegin()`, `VecMDotEnd()`, `PetscCommSplitReductionBegin()`
733 @*/
VecMTDotEnd(Vec x,PetscInt nv,const Vec y[],PetscScalar result[])734 PetscErrorCode VecMTDotEnd(Vec x, PetscInt nv, const Vec y[], PetscScalar result[])
735 {
736   PetscFunctionBegin;
737   /*
738       MTDotEnd() is the same as MDotEnd() so reuse the code
739   */
740   PetscCall(VecMDotEnd(x, nv, y, result));
741   PetscFunctionReturn(PETSC_SUCCESS);
742 }
743