/* xref: /petsc/include/petsc/private/cupmatomics.hpp (revision 1dfc8ce42e07d12c18a7dd9b69e074f6cb934f89) */
#pragma once

/*====================================================================================*/
/*                             Atomic operations on device                            */
/*====================================================================================*/
#include <petscdevice_cupm.h>
#include <petscsystypes.h>

/* In terms of function overloading, long long int is a different type than int64_t, which PetscInt might be defined to.
   We prefer long long int over PetscInt (int64_t), since CUDA atomics are built around (unsigned) long long int.
 */
typedef long long int          llint;
typedef unsigned long long int ullint;

#if PetscDefined(USING_NVCC)
PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wunused-function")
/*
  Atomic Insert (exchange) operations

  CUDA C Programming Guide V10.1 Chapter B.12.1.3:

  int atomicExch(int* address, int val);
  unsigned int atomicExch(unsigned int* address, unsigned int val);
  unsigned long long int atomicExch(unsigned long long int* address, unsigned long long int val);
  float atomicExch(float* address, float val);

  reads the 32-bit or 64-bit word old located at the address in global or shared
  memory and stores val back to memory at the same address. These two operations are
  performed in one atomic transaction. The function returns old.

  PETSc notes:

  It may be useful in PetscSFFetchAndOp with op = MPI_REPLACE.

  VecScatter with multiple entries scattered to the same location using INSERT_VALUES does not need
  atomic insertion, since it does not need the old value. A 32-bit or 64-bit store instruction should
  be atomic itself.

  With bs>1 and a unit > 64-bits, the current element-wise atomic approach can not guarantee the whole
  insertion is atomic. Hope no user codes rely on that.
*/
/* CUDA has no double overload of atomicExch; reinterpret the payload through the 64-bit integer overload */
__device__ static double atomicExch(double *address, double val)
{
  return __longlong_as_double(atomicExch((ullint *)address, __double_as_longlong(val)));
}

/* Same trick for signed 64-bit integers: the bit pattern is exchanged via the ullint overload */
__device__ static llint atomicExch(llint *address, llint val)
{
  return (llint)(atomicExch((ullint *)address, (ullint)val));
}

/* Functor wrapping atomicExch: x = y atomically, returning the old value of x */
template <typename Type>
struct AtomicInsert {
  __device__ Type operator()(Type &x, Type y) const { return atomicExch(&x, y); }
};

57   #if defined(PETSC_HAVE_COMPLEX)
58     #if defined(PETSC_USE_REAL_DOUBLE)
59 /* CUDA does not support 128-bit atomics. Users should not insert different 128-bit PetscComplex values to the same location */
60 template <>
61 struct AtomicInsert<PetscComplex> {
operator ()AtomicInsert62   __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
63   {
64     PetscComplex         old, *z = &old;
65     double              *xp = (double *)&x, *yp = (double *)&y;
66     AtomicInsert<double> op;
67     z[0] = op(xp[0], yp[0]);
68     z[1] = op(xp[1], yp[1]);
69     return old; /* The returned value may not be atomic. It can be mix of two ops. Caller should discard it. */
70   }
71 };
72     #elif defined(PETSC_USE_REAL_SINGLE)
73 template <>
74 struct AtomicInsert<PetscComplex> {
operator ()AtomicInsert75   __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
76   {
77     double              *xp = (double *)&x, *yp = (double *)&y;
78     AtomicInsert<double> op;
79     return op(xp[0], yp[0]);
80   }
81 };
82     #endif
83   #endif
84 
/*
  Atomic add operations

  CUDA C Programming Guide V10.1 Chapter B.12.1.1:

  int atomicAdd(int* address, int val);
  unsigned int atomicAdd(unsigned int* address,unsigned int val);
  unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val);
  float atomicAdd(float* address, float val);
  double atomicAdd(double* address, double val);
  __half2 atomicAdd(__half2 *address, __half2 val);
  __half atomicAdd(__half *address, __half val);

  reads the 16-bit, 32-bit or 64-bit word old located at the address in global or shared memory, computes (old + val),
  and stores the result back to memory at the same address. These three operations are performed in one atomic transaction. The
  function returns old.

  The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher.
  The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher.
  The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and
  higher. The atomicity of the __half2 add operation is guaranteed separately for each of the two __half elements;
  the entire __half2 is not guaranteed to be atomic as a single 32-bit access.
  The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
*/
/* Signed 64-bit add built on the unsigned 64-bit overload; two's complement makes the reinterpretation exact */
__device__ static llint atomicAdd(llint *address, llint val)
{
  return (llint)atomicAdd((ullint *)address, (ullint)val);
}

/* Functor wrapping atomicAdd: x += y atomically, returning the old value of x */
template <typename Type>
struct AtomicAdd {
  __device__ Type operator()(Type &x, Type y) const { return atomicAdd(&x, y); }
};

template <>
struct AtomicAdd<double> {
  __device__ double operator()(double &x, double y) const
  {
  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
    /* Native double atomicAdd requires compute capability 6.x+ */
    return atomicAdd(&x, y);
  #else
    /* Emulate with an atomicCAS loop on the 64-bit bit pattern */
    double *address = &x, val = y;
    ullint *address_as_ull = (ullint *)address;
    ullint  old            = *address_as_ull, assumed;
    do {
      assumed = old;
      old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
      /* Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) */
    } while (assumed != old);
    return __longlong_as_double(old);
  #endif
  }
};

template <>
struct AtomicAdd<float> {
  __device__ float operator()(float &x, float y) const
  {
  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
    /* Native float atomicAdd requires compute capability 2.x+ */
    return atomicAdd(&x, y);
  #else
    /* Emulate with an atomicCAS loop on the 32-bit bit pattern */
    float *address = &x, val = y;
    int   *address_as_int = (int *)address;
    int    old            = *address_as_int, assumed;
    do {
      assumed = old;
      old     = atomicCAS(address_as_int, assumed, __float_as_int(val + __int_as_float(assumed)));
      /* Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) */
    } while (assumed != old);
    return __int_as_float(old);
  #endif
  }
};

159   #if defined(PETSC_HAVE_COMPLEX)
160 template <>
161 struct AtomicAdd<PetscComplex> {
operator ()AtomicAdd162   __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
163   {
164     PetscComplex         old, *z = &old;
165     PetscReal           *xp = (PetscReal *)&x, *yp = (PetscReal *)&y;
166     AtomicAdd<PetscReal> op;
167     z[0] = op(xp[0], yp[0]);
168     z[1] = op(xp[1], yp[1]);
169     return old; /* The returned value may not be atomic. It can be mix of two ops. Caller should discard it. */
170   }
171 };
172   #endif
173 
174   /*
175   Atomic Mult operations:
176 
177   CUDA has no atomicMult at all, so we build our own with atomicCAS
178  */
179   #if defined(PETSC_USE_REAL_DOUBLE)
atomicMult(double * address,double val)180 __device__ static double atomicMult(double *address, double val)
181 {
182   ullint *address_as_ull = (ullint *)(address);
183   ullint  old            = *address_as_ull, assumed;
184   do {
185     assumed = old;
186     /* Other threads can access and modify value of *address_as_ull after the read above and before the write below */
187     old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * __longlong_as_double(assumed)));
188   } while (assumed != old);
189   return __longlong_as_double(old);
190 }
191   #elif defined(PETSC_USE_REAL_SINGLE)
atomicMult(float * address,float val)192 __device__ static float atomicMult(float *address, float val)
193 {
194   int *address_as_int = (int *)(address);
195   int  old            = *address_as_int, assumed;
196   do {
197     assumed = old;
198     old     = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed)));
199   } while (assumed != old);
200   return __int_as_float(old);
201 }
202   #endif
203 
atomicMult(int * address,int val)204 __device__ static int atomicMult(int *address, int val)
205 {
206   int *address_as_int = (int *)(address);
207   int  old            = *address_as_int, assumed;
208   do {
209     assumed = old;
210     old     = atomicCAS(address_as_int, assumed, val * assumed);
211   } while (assumed != old);
212   return (int)old;
213 }
214 
atomicMult(llint * address,llint val)215 __device__ static llint atomicMult(llint *address, llint val)
216 {
217   ullint *address_as_ull = (ullint *)(address);
218   ullint  old            = *address_as_ull, assumed;
219   do {
220     assumed = old;
221     old     = atomicCAS(address_as_ull, assumed, (ullint)(val * (llint)assumed));
222   } while (assumed != old);
223   return (llint)old;
224 }
225 
226 template <typename Type>
227 struct AtomicMult {
operator ()AtomicMult228   __device__ Type operator()(Type &x, Type y) const { return atomicMult(&x, y); }
229 };
230 
/*
  Atomic Min/Max operations

  CUDA C Programming Guide V10.1 Chapter B.12.1.4~5:

  int atomicMin(int* address, int val);
  unsigned int atomicMin(unsigned int* address,unsigned int val);
  unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val);

  reads the 32-bit or 64-bit word old located at the address in global or shared
  memory, computes the minimum of old and val, and stores the result back to memory
  at the same address. These three operations are performed in one atomic transaction.
  The function returns old.
  The 64-bit version of atomicMin() is only supported by devices of compute capability 3.5 and higher.

  atomicMax() is similar.
 */

  #if defined(PETSC_USE_REAL_DOUBLE)
__device__ static double atomicMin(double *address, double val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val, __longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}

__device__ static double atomicMax(double *address, double val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val, __longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}
  #elif defined(PETSC_USE_REAL_SINGLE)
__device__ static float atomicMin(float *address, float val)
{
  int *address_as_int = (int *)(address);
  int  old            = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val, __int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}

__device__ static float atomicMax(float *address, float val)
{
  int *address_as_int = (int *)(address);
  int  old            = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val, __int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}
  #endif

  /*
  atomicMin/Max(long long *, long long) are not in Nvidia's documentation. But on OLCF Summit we found
  atomicMin/Max/And/Or/Xor(long long *, long long) in /sw/summit/cuda/10.1.243/include/sm_32_atomic_functions.h.
  This causes compilation errors with pgi compilers and 64-bit indices:
      error: function "atomicMin(long long *, long long)" has already been defined

  So we add extra conditions defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
*/
  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
__device__ static llint atomicMin(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(PetscMin(val, (llint)assumed)));
  } while (assumed != old);
  return (llint)old;
}

__device__ static llint atomicMax(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(PetscMax(val, (llint)assumed)));
  } while (assumed != old);
  return (llint)old;
}
  #endif

/* Functors wrapping atomicMin/atomicMax: x = min/max(x, y) atomically, returning the old value of x */
template <typename Type>
struct AtomicMin {
  __device__ Type operator()(Type &x, Type y) const { return atomicMin(&x, y); }
};
template <typename Type>
struct AtomicMax {
  __device__ Type operator()(Type &x, Type y) const { return atomicMax(&x, y); }
};

/*
  Atomic bitwise operations

  CUDA C Programming Guide V10.1 Chapter B.12.2.1 ~ B.12.2.3:

  int atomicAnd(int* address, int val);
  unsigned int atomicAnd(unsigned int* address,unsigned int val);
  unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val);

  reads the 32-bit or 64-bit word old located at the address in global or shared
  memory, computes (old & val), and stores the result back to memory at the same
  address. These three operations are performed in one atomic transaction.
  The function returns old.

  The 64-bit version of atomicAnd() is only supported by devices of compute capability 3.5 and higher.

  atomicOr() and atomicXor are similar.
*/

  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320) /* Why 320? see comments at atomicMin() above */
__device__ static llint atomicAnd(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(val & (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}
__device__ static llint atomicOr(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(val | (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}

__device__ static llint atomicXor(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(val ^ (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}
  #endif

/* Functors wrapping atomicAnd/Or/Xor: x op= y atomically, returning the old value of x */
template <typename Type>
struct AtomicBAND {
  __device__ Type operator()(Type &x, Type y) const { return atomicAnd(&x, y); }
};
template <typename Type>
struct AtomicBOR {
  __device__ Type operator()(Type &x, Type y) const { return atomicOr(&x, y); }
};
template <typename Type>
struct AtomicBXOR {
  __device__ Type operator()(Type &x, Type y) const { return atomicXor(&x, y); }
};

/*
  Atomic logical operations:

  CUDA has no atomic logical operations at all. We support them on integer types.
*/

/* A template without definition makes any instantiation not using given specializations erroneous at compile time,
   which is what we want since we only support 32-bit and 64-bit integers.
 */
template <typename Type, class Op, int size /* sizeof(Type) */>
struct AtomicLogical;

template <typename Type, class Op>
struct AtomicLogical<Type, Op, 4> {
  __device__ Type operator()(Type &x, Type y) const
  {
    int *address_as_int = (int *)(&x);
    int  old            = *address_as_int, assumed;
    Op   op;
    do {
      assumed = old;
      old     = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed, y)));
    } while (assumed != old);
    return (Type)old;
  }
};

template <typename Type, class Op>
struct AtomicLogical<Type, Op, 8> {
  __device__ Type operator()(Type &x, Type y) const
  {
    ullint *address_as_ull = (ullint *)(&x);
    ullint  old            = *address_as_ull, assumed;
    Op      op;
    do {
      assumed = old;
      old     = atomicCAS(address_as_ull, assumed, (ullint)(op((Type)assumed, y)));
    } while (assumed != old);
    return (Type)old;
  }
};

/* Note land/lor/lxor below are different from LAND etc above. Here we pass arguments by value and return result of ops (not old value) */
template <typename Type>
struct land {
  __device__ Type operator()(Type x, Type y) { return x && y; }
};
template <typename Type>
struct lor {
  __device__ Type operator()(Type x, Type y) { return x || y; }
};
template <typename Type>
struct lxor {
  __device__ Type operator()(Type x, Type y) { return !x != !y; }
};

template <typename Type>
struct AtomicLAND {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, land<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};
template <typename Type>
struct AtomicLOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, lor<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};
template <typename Type>
struct AtomicLXOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, lxor<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};
PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
#elif PetscDefined(USING_HCC)
484 
485   /*
486   Atomic Insert (exchange) operations
487 
488   See Cuda version
489 */
490   #if PETSC_PKG_HIP_VERSION_LT(4, 4, 0)
491 __device__ static double atomicExch(double *address, double val)
492 {
493   return __longlong_as_double(atomicExch((ullint *)address, __double_as_longlong(val)));
494 }
495   #endif
496 
497 __device__ static inline llint atomicExch(llint *address, llint val)
498 {
499   return (llint)(atomicExch((ullint *)address, (ullint)val));
500 }
501 
502 template <typename Type>
503 struct AtomicInsert {
504   __device__ Type operator()(Type &x, Type y) const { return atomicExch(&x, y); }
505 };
506 
507   #if defined(PETSC_HAVE_COMPLEX)
508     #if defined(PETSC_USE_REAL_DOUBLE)
509 template <>
510 struct AtomicInsert<PetscComplex> {
511   __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
512   {
513     PetscComplex         old, *z = &old;
514     double              *xp = (double *)&x, *yp = (double *)&y;
515     AtomicInsert<double> op;
516     z[0] = op(xp[0], yp[0]);
517     z[1] = op(xp[1], yp[1]);
518     return old; /* The returned value may not be atomic. It can be mix of two ops. Caller should discard it. */
519   }
520 };
521     #elif defined(PETSC_USE_REAL_SINGLE)
522 template <>
523 struct AtomicInsert<PetscComplex> {
524   __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
525   {
526     double              *xp = (double *)&x, *yp = (double *)&y;
527     AtomicInsert<double> op;
528     return op(xp[0], yp[0]);
529   }
530 };
531     #endif
532   #endif
533 
/*
  Atomic add operations

*/
/* Signed 64-bit add built on the unsigned 64-bit overload; two's complement makes the reinterpretation exact */
__device__ static inline llint atomicAdd(llint *address, llint val)
{
  return (llint)atomicAdd((ullint *)address, (ullint)val);
}

/* Functor wrapping atomicAdd: x += y atomically, returning the old value of x */
template <typename Type>
struct AtomicAdd {
  __device__ Type operator()(Type &x, Type y) const { return atomicAdd(&x, y); }
};

template <>
struct AtomicAdd<double> {
  __device__ double operator()(double &x, double y) const
  {
    /* Cuda version does more checks that may be needed */
    return atomicAdd(&x, y);
  }
};

template <>
struct AtomicAdd<float> {
  __device__ float operator()(float &x, float y) const
  {
    /* Cuda version does more checks that may be needed */
    return atomicAdd(&x, y);
  }
};

  #if defined(PETSC_HAVE_COMPLEX)
template <>
struct AtomicAdd<PetscComplex> {
  __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
  {
    PetscComplex         old, *z = &old;
    PetscReal           *xp = (PetscReal *)&x, *yp = (PetscReal *)&y;
    AtomicAdd<PetscReal> op;
    /* Add real and imaginary parts with two independent real atomics */
    z[0] = op(xp[0], yp[0]);
    z[1] = op(xp[1], yp[1]);
    return old; /* The returned value may not be atomic. It can be mix of two ops. Caller should discard it. */
  }
};
  #endif

581   /*
582   Atomic Mult operations:
583 
584   HIP has no atomicMult at all, so we build our own with atomicCAS
585  */
586   #if defined(PETSC_USE_REAL_DOUBLE)
587 __device__ static inline double atomicMult(double *address, double val)
588 {
589   ullint *address_as_ull = (ullint *)(address);
590   ullint  old            = *address_as_ull, assumed;
591   do {
592     assumed = old;
593     /* Other threads can access and modify value of *address_as_ull after the read above and before the write below */
594     old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * __longlong_as_double(assumed)));
595   } while (assumed != old);
596   return __longlong_as_double(old);
597 }
598   #elif defined(PETSC_USE_REAL_SINGLE)
599 __device__ static inline float atomicMult(float *address, float val)
600 {
601   int *address_as_int = (int *)(address);
602   int  old            = *address_as_int, assumed;
603   do {
604     assumed = old;
605     old     = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed)));
606   } while (assumed != old);
607   return __int_as_float(old);
608 }
609   #endif
610 
611 __device__ static inline int atomicMult(int *address, int val)
612 {
613   int *address_as_int = (int *)(address);
614   int  old            = *address_as_int, assumed;
615   do {
616     assumed = old;
617     old     = atomicCAS(address_as_int, assumed, val * assumed);
618   } while (assumed != old);
619   return (int)old;
620 }
621 
622 __device__ static inline llint atomicMult(llint *address, llint val)
623 {
624   ullint *address_as_ull = (ullint *)(address);
625   ullint  old            = *address_as_ull, assumed;
626   do {
627     assumed = old;
628     old     = atomicCAS(address_as_ull, assumed, (ullint)(val * (llint)assumed));
629   } while (assumed != old);
630   return (llint)old;
631 }
632 
633 template <typename Type>
634 struct AtomicMult {
635   __device__ Type operator()(Type &x, Type y) const { return atomicMult(&x, y); }
636 };
637 
638   /*
639   Atomic Min/Max operations
640 
641   See CUDA version for comments.
642  */
643   #if PETSC_PKG_HIP_VERSION_LT(4, 4, 0)
644     #if defined(PETSC_USE_REAL_DOUBLE)
645 __device__ static double atomicMin(double *address, double val)
646 {
647   ullint *address_as_ull = (ullint *)(address);
648   ullint  old            = *address_as_ull, assumed;
649   do {
650     assumed = old;
651     old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val, __longlong_as_double(assumed))));
652   } while (assumed != old);
653   return __longlong_as_double(old);
654 }
655 
656 __device__ static double atomicMax(double *address, double val)
657 {
658   ullint *address_as_ull = (ullint *)(address);
659   ullint  old            = *address_as_ull, assumed;
660   do {
661     assumed = old;
662     old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val, __longlong_as_double(assumed))));
663   } while (assumed != old);
664   return __longlong_as_double(old);
665 }
666     #elif defined(PETSC_USE_REAL_SINGLE)
667 __device__ static float atomicMin(float *address, float val)
668 {
669   int *address_as_int = (int *)(address);
670   int  old            = *address_as_int, assumed;
671   do {
672     assumed = old;
673     old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val, __int_as_float(assumed))));
674   } while (assumed != old);
675   return __int_as_float(old);
676 }
677 
678 __device__ static float atomicMax(float *address, float val)
679 {
680   int *address_as_int = (int *)(address);
681   int  old            = *address_as_int, assumed;
682   do {
683     assumed = old;
684     old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val, __int_as_float(assumed))));
685   } while (assumed != old);
686   return __int_as_float(old);
687 }
688     #endif
689   #endif
690 
691   #if PETSC_PKG_HIP_VERSION_LT(5, 7, 0)
692 __device__ static inline llint atomicMin(llint *address, llint val)
693 {
694   ullint *address_as_ull = (ullint *)(address);
695   ullint  old            = *address_as_ull, assumed;
696   do {
697     assumed = old;
698     old     = atomicCAS(address_as_ull, assumed, (ullint)(PetscMin(val, (llint)assumed)));
699   } while (assumed != old);
700   return (llint)old;
701 }
702 
703 __device__ static inline llint atomicMax(llint *address, llint val)
704 {
705   ullint *address_as_ull = (ullint *)(address);
706   ullint  old            = *address_as_ull, assumed;
707   do {
708     assumed = old;
709     old     = atomicCAS(address_as_ull, assumed, (ullint)(PetscMax(val, (llint)assumed)));
710   } while (assumed != old);
711   return (llint)old;
712 }
713   #endif
714 
715 template <typename Type>
716 struct AtomicMin {
717   __device__ Type operator()(Type &x, Type y) const { return atomicMin(&x, y); }
718 };
719 template <typename Type>
720 struct AtomicMax {
721   __device__ Type operator()(Type &x, Type y) const { return atomicMax(&x, y); }
722 };
723 
/*
  Atomic bitwise operations
  As of ROCm 3.10, the llint atomicAnd/Or/Xor(llint*, llint) is not supported
*/

__device__ static inline llint atomicAnd(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(val & (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}
__device__ static inline llint atomicOr(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(val | (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}

__device__ static inline llint atomicXor(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old            = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(val ^ (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}

/* Functors wrapping atomicAnd/Or/Xor: x op= y atomically, returning the old value of x */
template <typename Type>
struct AtomicBAND {
  __device__ Type operator()(Type &x, Type y) const { return atomicAnd(&x, y); }
};
template <typename Type>
struct AtomicBOR {
  __device__ Type operator()(Type &x, Type y) const { return atomicOr(&x, y); }
};
template <typename Type>
struct AtomicBXOR {
  __device__ Type operator()(Type &x, Type y) const { return atomicXor(&x, y); }
};

/*
  Atomic logical operations:

  HIP has no atomic logical operations at all. We support them on integer types.
*/

/* A template without definition makes any instantiation not using given specializations erroneous at compile time,
   which is what we want since we only support 32-bit and 64-bit integers.
 */
template <typename Type, class Op, int size /* sizeof(Type) */>
struct AtomicLogical;

template <typename Type, class Op>
struct AtomicLogical<Type, Op, 4> {
  __device__ Type operator()(Type &x, Type y) const
  {
    int *address_as_int = (int *)(&x);
    int  old            = *address_as_int, assumed;
    Op   op;
    do {
      assumed = old;
      old     = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed, y)));
    } while (assumed != old);
    return (Type)old;
  }
};

template <typename Type, class Op>
struct AtomicLogical<Type, Op, 8> {
  __device__ Type operator()(Type &x, Type y) const
  {
    ullint *address_as_ull = (ullint *)(&x);
    ullint  old            = *address_as_ull, assumed;
    Op      op;
    do {
      assumed = old;
      old     = atomicCAS(address_as_ull, assumed, (ullint)(op((Type)assumed, y)));
    } while (assumed != old);
    return (Type)old;
  }
};

/* Note land/lor/lxor below are different from LAND etc above. Here we pass arguments by value and return result of ops (not old value) */
template <typename Type>
struct land {
  __device__ Type operator()(Type x, Type y) { return x && y; }
};
template <typename Type>
struct lor {
  __device__ Type operator()(Type x, Type y) { return x || y; }
};
template <typename Type>
struct lxor {
  __device__ Type operator()(Type x, Type y) { return !x != !y; }
};

template <typename Type>
struct AtomicLAND {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, land<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};
template <typename Type>
struct AtomicLOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, lor<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};
template <typename Type>
struct AtomicLXOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, lxor<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};
#endif
855