1 #pragma once
2
3 /*====================================================================================*/
4 /* Atomic operations on device */
5 /*====================================================================================*/
6 #include <petscdevice_cupm.h>
7 #include <petscsystypes.h>
8
9 /* In terms of function overloading, long long int is a different type than int64_t, which PetscInt might be defined to.
10 We prefer long long int over PetscInt (int64_t), since CUDA atomics are built around (unsigned) long long int.
11 */
12 typedef long long int llint;
13 typedef unsigned long long int ullint;
14
15 #if PetscDefined(USING_NVCC)
16 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wunused-function")
17 /*
18 Atomic Insert (exchange) operations
19
20 CUDA C Programming Guide V10.1 Chapter B.12.1.3:
21
22 int atomicExch(int* address, int val);
23 unsigned int atomicExch(unsigned int* address, unsigned int val);
24 unsigned long long int atomicExch(unsigned long long int* address, unsigned long long int val);
25 float atomicExch(float* address, float val);
26
27 reads the 32-bit or 64-bit word old located at the address in global or shared
28 memory and stores val back to memory at the same address. These two operations are
29 performed in one atomic transaction. The function returns old.
30
31 PETSc notes:
32
33 It may be useful in PetscSFFetchAndOp with op = MPI_REPLACE.
34
35 VecScatter with multiple entries scattered to the same location using INSERT_VALUES does not need
36 atomic insertion, since it does not need the old value. A 32-bit or 64-bit store instruction should
37 be atomic itself.
38
39 With bs>1 and a unit > 64-bits, the current element-wise atomic approach can not guarantee the whole
40 insertion is atomic. Hope no user codes rely on that.
41 */
/* Overload atomicExch() for double: CUDA only provides the 64-bit exchange for
   unsigned long long, so type-pun the double through its 64-bit bit pattern.
   Returns the previous value stored at address. */
__device__ static double atomicExch(double *address, double val)
{
  return __longlong_as_double(atomicExch((ullint *)address, __double_as_longlong(val)));
}

/* Overload atomicExch() for signed long long by routing through the unsigned
   overload CUDA provides; two's-complement reinterpretation preserves the bits. */
__device__ static llint atomicExch(llint *address, llint val)
{
  return (llint)(atomicExch((ullint *)address, (ullint)val));
}
51
/* Functor performing an atomic insert: atomically stores y into x and returns
   the old value of x (atomic exchange). */
template <typename Type>
struct AtomicInsert {
  __device__ Type operator()(Type &x, Type y) const { return atomicExch(&x, y); }
};
56
#if defined(PETSC_HAVE_COMPLEX)
#if defined(PETSC_USE_REAL_DOUBLE)
/* CUDA does not support 128-bit atomics. Users should not insert different 128-bit PetscComplex values to the same location.
   The real and imaginary parts are exchanged with two independent 64-bit atomics. */
template <>
struct AtomicInsert<PetscComplex> {
  __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
  {
    PetscComplex         old, *z = &old;
    double              *xp = (double *)&x, *yp = (double *)&y;
    AtomicInsert<double> op;
    z[0] = op(xp[0], yp[0]);
    z[1] = op(xp[1], yp[1]);
    return old; /* The returned value may not be atomic. It can be mix of two ops. Caller should discard it. */
  }
};
#elif defined(PETSC_USE_REAL_SINGLE)
/* A single-precision PetscComplex occupies 64 bits, so the whole value is exchanged
   in one shot by treating its storage as a double. */
template <>
struct AtomicInsert<PetscComplex> {
  __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
  {
    double              *xp = (double *)&x, *yp = (double *)&y;
    AtomicInsert<double> op;
    /* NOTE(review): the old bits come back numerically converted to PetscComplex;
       callers should discard the returned value. */
    return op(xp[0], yp[0]);
  }
};
#endif
#endif
84
85 /*
86 Atomic add operations
87
88 CUDA C Programming Guide V10.1 Chapter B.12.1.1:
89
90 int atomicAdd(int* address, int val);
91 unsigned int atomicAdd(unsigned int* address,unsigned int val);
92 unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val);
93 float atomicAdd(float* address, float val);
94 double atomicAdd(double* address, double val);
95 __half2 atomicAdd(__half2 *address, __half2 val);
96 __half atomicAdd(__half *address, __half val);
97
98 reads the 16-bit, 32-bit or 64-bit word old located at the address in global or shared memory, computes (old + val),
99 and stores the result back to memory at the same address. These three operations are performed in one atomic transaction. The
100 function returns old.
101
102 The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher.
103 The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher.
104 The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and
105 higher. The atomicity of the __half2 add operation is guaranteed separately for each of the two __half elements;
106 the entire __half2 is not guaranteed to be atomic as a single 32-bit access.
107 The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
108 */
/* Overload atomicAdd() for signed long long via the unsigned overload CUDA provides;
   two's-complement arithmetic makes the reinterpretation safe. */
__device__ static llint atomicAdd(llint *address, llint val)
{
  return (llint)atomicAdd((ullint *)address, (ullint)val);
}

/* Functor performing x += y atomically, returning the old value of x. */
template <typename Type>
struct AtomicAdd {
  __device__ Type operator()(Type &x, Type y) const { return atomicAdd(&x, y); }
};
118
/* atomicAdd(double*) is native only on compute capability >= 6.0; on older
   architectures emulate it with an atomicCAS loop on the 64-bit bit pattern. */
template <>
struct AtomicAdd<double> {
  __device__ double operator()(double &x, double y) const
  {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
    return atomicAdd(&x, y);
#else
    double *address = &x, val = y;
    ullint *address_as_ull = (ullint *)address;
    ullint  old = *address_as_ull, assumed;
    do {
      assumed = old;
      old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
      /* Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) */
    } while (assumed != old);
    return __longlong_as_double(old);
#endif
  }
};

/* atomicAdd(float*) is native on compute capability >= 2.0; on older
   architectures emulate it with an atomicCAS loop on the 32-bit bit pattern. */
template <>
struct AtomicAdd<float> {
  __device__ float operator()(float &x, float y) const
  {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
    return atomicAdd(&x, y);
#else
    float *address = &x, val = y;
    int   *address_as_int = (int *)address;
    int    old = *address_as_int, assumed;
    do {
      assumed = old;
      old     = atomicCAS(address_as_int, assumed, __float_as_int(val + __int_as_float(assumed)));
      /* Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) */
    } while (assumed != old);
    return __int_as_float(old);
#endif
  }
};
158
#if defined(PETSC_HAVE_COMPLEX)
/* Complex addition decomposes into independent real and imaginary additions,
   each done with its own real-typed atomic add. */
template <>
struct AtomicAdd<PetscComplex> {
  __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
  {
    PetscComplex         old, *z = &old;
    PetscReal           *xp = (PetscReal *)&x, *yp = (PetscReal *)&y;
    AtomicAdd<PetscReal> op;
    z[0] = op(xp[0], yp[0]);
    z[1] = op(xp[1], yp[1]);
    return old; /* The returned value may not be atomic. It can be mix of two ops. Caller should discard it. */
  }
};
#endif
173
174 /*
175 Atomic Mult operations:
176
177 CUDA has no atomicMult at all, so we build our own with atomicCAS
178 */
#if defined(PETSC_USE_REAL_DOUBLE)
/* Atomically multiply *address by val via an atomicCAS loop on the 64-bit bit
   pattern (CUDA has no native atomicMult); returns the old value. */
__device__ static double atomicMult(double *address, double val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old = *address_as_ull, assumed;
  do {
    assumed = old;
    /* Other threads can access and modify value of *address_as_ull after the read above and before the write below */
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#elif defined(PETSC_USE_REAL_SINGLE)
/* Atomically multiply *address by val via an atomicCAS loop on the 32-bit bit
   pattern; returns the old value. */
__device__ static float atomicMult(float *address, float val)
{
  int *address_as_int = (int *)(address);
  int  old = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed)));
  } while (assumed != old);
  return __int_as_float(old);
}
#endif

/* Atomic multiply for 32-bit integers, emulated with atomicCAS; returns the old value. */
__device__ static int atomicMult(int *address, int val)
{
  int *address_as_int = (int *)(address);
  int  old = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, val * assumed);
  } while (assumed != old);
  return (int)old;
}

/* Atomic multiply for signed 64-bit integers, emulated with atomicCAS on the
   unsigned representation; returns the old value. */
__device__ static llint atomicMult(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(val * (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}

/* Functor performing x *= y atomically, returning the old value of x. */
template <typename Type>
struct AtomicMult {
  __device__ Type operator()(Type &x, Type y) const { return atomicMult(&x, y); }
};
230
231 /*
232 Atomic Min/Max operations
233
234 CUDA C Programming Guide V10.1 Chapter B.12.1.4~5:
235
236 int atomicMin(int* address, int val);
237 unsigned int atomicMin(unsigned int* address,unsigned int val);
238 unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val);
239
240 reads the 32-bit or 64-bit word old located at the address in global or shared
241 memory, computes the minimum of old and val, and stores the result back to memory
242 at the same address. These three operations are performed in one atomic transaction.
243 The function returns old.
244 The 64-bit version of atomicMin() is only supported by devices of compute capability 3.5 and higher.
245
246 atomicMax() is similar.
247 */
248
#if defined(PETSC_USE_REAL_DOUBLE)
/* Atomic min on double, emulated with an atomicCAS loop on the 64-bit bit
   pattern (CUDA has no floating-point atomicMin); returns the old value. */
__device__ static double atomicMin(double *address, double val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val, __longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}

/* Atomic max on double, emulated with an atomicCAS loop; returns the old value. */
__device__ static double atomicMax(double *address, double val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val, __longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#elif defined(PETSC_USE_REAL_SINGLE)
/* Atomic min on float, emulated with an atomicCAS loop on the 32-bit bit
   pattern; returns the old value. */
__device__ static float atomicMin(float *address, float val)
{
  int *address_as_int = (int *)(address);
  int  old = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val, __int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}

/* Atomic max on float, emulated with an atomicCAS loop; returns the old value. */
__device__ static float atomicMax(float *address, float val)
{
  int *address_as_int = (int *)(address);
  int  old = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val, __int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}
#endif
294
295 /*
296 atomicMin/Max(long long *, long long) are not in Nvidia's documentation. But on OLCF Summit we found
297 atomicMin/Max/And/Or/Xor(long long *, long long) in /sw/summit/cuda/10.1.243/include/sm_32_atomic_functions.h.
298 This causes compilation errors with pgi compilers and 64-bit indices:
299 error: function "atomicMin(long long *, long long)" has already been defined
300
301 So we add extra conditions defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
302 */
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
/* Atomic min for signed 64-bit integers, emulated with atomicCAS; only built for
   arch < 3.2 to avoid redefining the vendor-provided overload (see comment above). */
__device__ static llint atomicMin(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(PetscMin(val, (llint)assumed)));
  } while (assumed != old);
  return (llint)old;
}

/* Atomic max for signed 64-bit integers, emulated with atomicCAS. */
__device__ static llint atomicMax(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(PetscMax(val, (llint)assumed)));
  } while (assumed != old);
  return (llint)old;
}
#endif

/* Functor performing x = min(x, y) atomically, returning the old value of x. */
template <typename Type>
struct AtomicMin {
  __device__ Type operator()(Type &x, Type y) const { return atomicMin(&x, y); }
};
/* Functor performing x = max(x, y) atomically, returning the old value of x. */
template <typename Type>
struct AtomicMax {
  __device__ Type operator()(Type &x, Type y) const { return atomicMax(&x, y); }
};
335
336 /*
337 Atomic bitwise operations
338
339 CUDA C Programming Guide V10.1 Chapter B.12.2.1 ~ B.12.2.3:
340
341 int atomicAnd(int* address, int val);
342 unsigned int atomicAnd(unsigned int* address,unsigned int val);
343 unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val);
344
345 reads the 32-bit or 64-bit word old located at the address in global or shared
346 memory, computes (old & val), and stores the result back to memory at the same
347 address. These three operations are performed in one atomic transaction.
348 The function returns old.
349
350 The 64-bit version of atomicAnd() is only supported by devices of compute capability 3.5 and higher.
351
352 atomicOr() and atomicXor are similar.
353 */
354
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320) /* Why 320? see comments at atomicMin() above */
/* Atomic bitwise AND for signed 64-bit integers, emulated with atomicCAS; returns the old value. */
__device__ static llint atomicAnd(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(val & (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}
/* Atomic bitwise OR for signed 64-bit integers, emulated with atomicCAS; returns the old value. */
__device__ static llint atomicOr(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(val | (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}

/* Atomic bitwise XOR for signed 64-bit integers, emulated with atomicCAS; returns the old value. */
__device__ static llint atomicXor(llint *address, llint val)
{
  ullint *address_as_ull = (ullint *)(address);
  ullint  old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(val ^ (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}
#endif

/* Functor performing x &= y atomically, returning the old value of x. */
template <typename Type>
struct AtomicBAND {
  __device__ Type operator()(Type &x, Type y) const { return atomicAnd(&x, y); }
};
/* Functor performing x |= y atomically, returning the old value of x. */
template <typename Type>
struct AtomicBOR {
  __device__ Type operator()(Type &x, Type y) const { return atomicOr(&x, y); }
};
/* Functor performing x ^= y atomically, returning the old value of x. */
template <typename Type>
struct AtomicBXOR {
  __device__ Type operator()(Type &x, Type y) const { return atomicXor(&x, y); }
};
401
402 /*
403 Atomic logical operations:
404
405 CUDA has no atomic logical operations at all. We support them on integer types.
406 */
407
408 /* A template without definition makes any instantiation not using given specializations erroneous at compile time,
409 which is what we want since we only support 32-bit and 64-bit integers.
410 */
/* Primary template intentionally left undefined: instantiating AtomicLogical for a
   width other than 4 or 8 bytes is a compile-time error, which is what we want since
   only 32-bit and 64-bit integers are supported. */
template <typename Type, class Op, int size /* sizeof(Type) */>
struct AtomicLogical;

/* 32-bit specialization: atomically compute x = Op(x, y) with a CAS loop on the
   int word holding x; returns the old value of x. */
template <typename Type, class Op>
struct AtomicLogical<Type, Op, 4> {
  __device__ Type operator()(Type &x, Type y) const
  {
    int *address_as_int = (int *)(&x);
    int  old = *address_as_int, assumed;
    Op   op;
    do {
      assumed = old;
      old     = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed, y)));
    } while (assumed != old);
    return (Type)old;
  }
};

/* 64-bit specialization: same CAS-loop scheme on the unsigned long long word holding x. */
template <typename Type, class Op>
struct AtomicLogical<Type, Op, 8> {
  __device__ Type operator()(Type &x, Type y) const
  {
    ullint *address_as_ull = (ullint *)(&x);
    ullint  old = *address_as_ull, assumed;
    Op      op;
    do {
      assumed = old;
      old     = atomicCAS(address_as_ull, assumed, (ullint)(op((Type)assumed, y)));
    } while (assumed != old);
    return (Type)old;
  }
};
443
444 /* Note land/lor/lxor below are different from LAND etc above. Here we pass arguments by value and return result of ops (not old value) */
/* Logical AND on values: returns x && y (0 or 1), not the old value. */
template <typename Type>
struct land {
  __device__ Type operator()(Type x, Type y) { return x && y; }
};
/* Logical OR on values: returns x || y (0 or 1). */
template <typename Type>
struct lor {
  __device__ Type operator()(Type x, Type y) { return x || y; }
};
/* Logical XOR on values: true iff exactly one of x, y is nonzero. */
template <typename Type>
struct lxor {
  __device__ Type operator()(Type x, Type y) { return !x != !y; }
};
457
/* Atomic logical AND: x = (x && y) atomically, returning the old value of x. */
template <typename Type>
struct AtomicLAND {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, land<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};
/* Atomic logical OR: x = (x || y) atomically, returning the old value of x. */
template <typename Type>
struct AtomicLOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, lor<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};
/* Atomic logical XOR: x = (!x != !y) atomically, returning the old value of x. */
template <typename Type>
struct AtomicLXOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, lxor<Type>, sizeof(Type)> op;
    return op(x, y);
  }
};
482 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
483 #elif PetscDefined(USING_HCC)
484
485 /*
486 Atomic Insert (exchange) operations
487
488 See Cuda version
489 */
#if PETSC_PKG_HIP_VERSION_LT(4, 4, 0)
/* Older HIP lacks atomicExch(double*); exchange the 64-bit bit pattern instead. */
__device__ static double atomicExch(double *address, double val)
{
  ullint prev = atomicExch((ullint *)address, __double_as_longlong(val));
  return __longlong_as_double(prev);
}
#endif

/* atomicExch for signed 64-bit integers, routed through the unsigned overload. */
__device__ static inline llint atomicExch(llint *address, llint val)
{
  ullint prev = atomicExch((ullint *)address, (ullint)val);
  return (llint)prev;
}
501
/* Functor performing an atomic insert: store y into x atomically and return the
   previous contents of x (atomic exchange). */
template <typename Type>
struct AtomicInsert {
  __device__ Type operator()(Type &x, Type y) const
  {
    return atomicExch(&x, y);
  }
};
506
#if defined(PETSC_HAVE_COMPLEX)
#if defined(PETSC_USE_REAL_DOUBLE)
/* A double-precision PetscComplex is 128 bits; no 128-bit atomics exist, so exchange
   the real and imaginary halves with two independent 64-bit atomics. */
template <>
struct AtomicInsert<PetscComplex> {
  __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
  {
    AtomicInsert<double> exch;
    PetscComplex         prev;
    double              *dst = (double *)&x;
    double              *src = (double *)&y;
    double              *out = (double *)&prev;
    out[0] = exch(dst[0], src[0]);
    out[1] = exch(dst[1], src[1]);
    return prev; /* The returned value may not be atomic. It can be mix of two ops. Caller should discard it. */
  }
};
#elif defined(PETSC_USE_REAL_SINGLE)
/* A single-precision PetscComplex occupies 64 bits, so exchange the whole value in
   one shot by viewing its storage as a double. */
template <>
struct AtomicInsert<PetscComplex> {
  __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
  {
    AtomicInsert<double> exch;
    double              *dst = (double *)&x;
    double              *src = (double *)&y;
    return exch(dst[0], src[0]);
  }
};
#endif
#endif
533
534 /*
535 Atomic add operations
536
537 */
/* atomicAdd for signed 64-bit integers, routed through the unsigned overload;
   two's-complement arithmetic makes the reinterpretation safe. */
__device__ static inline llint atomicAdd(llint *address, llint val)
{
  ullint prev = atomicAdd((ullint *)address, (ullint)val);
  return (llint)prev;
}

/* Functor performing x += y atomically, returning the previous value of x. */
template <typename Type>
struct AtomicAdd {
  __device__ Type operator()(Type &x, Type y) const
  {
    return atomicAdd(&x, y);
  }
};
547
/* double specialization: defer to the native atomicAdd.
   (The CUDA counterpart carries extra architecture checks that may be needed here too.) */
template <>
struct AtomicAdd<double> {
  __device__ double operator()(double &x, double y) const
  {
    return atomicAdd(&x, y);
  }
};

/* float specialization: defer to the native atomicAdd.
   (The CUDA counterpart carries extra architecture checks that may be needed here too.) */
template <>
struct AtomicAdd<float> {
  __device__ float operator()(float &x, float y) const
  {
    return atomicAdd(&x, y);
  }
};
565
#if defined(PETSC_HAVE_COMPLEX)
/* Complex addition decomposes into independent real and imaginary additions,
   each performed with its own real-typed atomic add. */
template <>
struct AtomicAdd<PetscComplex> {
  __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
  {
    AtomicAdd<PetscReal> add;
    PetscComplex         prev;
    PetscReal           *dst = (PetscReal *)&x;
    PetscReal           *src = (PetscReal *)&y;
    PetscReal           *out = (PetscReal *)&prev;
    out[0] = add(dst[0], src[0]);
    out[1] = add(dst[1], src[1]);
    return prev; /* The returned value may not be atomic. It can be mix of two ops. Caller should discard it. */
  }
};
#endif
580
581 /*
582 Atomic Mult operations:
583
584 HIP has no atomicMult at all, so we build our own with atomicCAS
585 */
#if defined(PETSC_USE_REAL_DOUBLE)
/* Atomic *address *= val built on atomicCAS (HIP has no native atomicMult);
   loops on the 64-bit bit pattern and returns the previous value. */
__device__ static inline double atomicMult(double *address, double val)
{
  ullint *p = (ullint *)(address);
  ullint  expected, seen = *p;
  do {
    expected = seen;
    /* another thread may update *p between the read and the CAS; retry until it sticks */
    seen = atomicCAS(p, expected, __double_as_longlong(val * __longlong_as_double(expected)));
  } while (seen != expected);
  return __longlong_as_double(seen);
}
#elif defined(PETSC_USE_REAL_SINGLE)
/* Atomic *address *= val via a CAS loop on the 32-bit bit pattern; returns the previous value. */
__device__ static inline float atomicMult(float *address, float val)
{
  int *p = (int *)(address);
  int  expected, seen = *p;
  do {
    expected = seen;
    seen     = atomicCAS(p, expected, __float_as_int(val * __int_as_float(expected)));
  } while (seen != expected);
  return __int_as_float(seen);
}
#endif

/* Atomic multiply for 32-bit integers, emulated with atomicCAS; returns the previous value. */
__device__ static inline int atomicMult(int *address, int val)
{
  int *p = (int *)(address);
  int  expected, seen = *p;
  do {
    expected = seen;
    seen     = atomicCAS(p, expected, val * expected);
  } while (seen != expected);
  return (int)seen;
}

/* Atomic multiply for signed 64-bit integers, emulated with atomicCAS on the
   unsigned representation; returns the previous value. */
__device__ static inline llint atomicMult(llint *address, llint val)
{
  ullint *p = (ullint *)(address);
  ullint  expected, seen = *p;
  do {
    expected = seen;
    seen     = atomicCAS(p, expected, (ullint)(val * (llint)expected));
  } while (seen != expected);
  return (llint)seen;
}

/* Functor performing x *= y atomically, returning the previous value of x. */
template <typename Type>
struct AtomicMult {
  __device__ Type operator()(Type &x, Type y) const
  {
    return atomicMult(&x, y);
  }
};
637
638 /*
639 Atomic Min/Max operations
640
641 See CUDA version for comments.
642 */
#if PETSC_PKG_HIP_VERSION_LT(4, 4, 0)
#if defined(PETSC_USE_REAL_DOUBLE)
/* Atomic min on double, built from atomicCAS on the 64-bit bit pattern; returns the previous value. */
__device__ static double atomicMin(double *address, double val)
{
  ullint *p = (ullint *)(address);
  ullint  expected, seen = *p;
  do {
    expected = seen;
    seen     = atomicCAS(p, expected, __double_as_longlong(PetscMin(val, __longlong_as_double(expected))));
  } while (seen != expected);
  return __longlong_as_double(seen);
}

/* Atomic max on double, built from atomicCAS on the 64-bit bit pattern; returns the previous value. */
__device__ static double atomicMax(double *address, double val)
{
  ullint *p = (ullint *)(address);
  ullint  expected, seen = *p;
  do {
    expected = seen;
    seen     = atomicCAS(p, expected, __double_as_longlong(PetscMax(val, __longlong_as_double(expected))));
  } while (seen != expected);
  return __longlong_as_double(seen);
}
#elif defined(PETSC_USE_REAL_SINGLE)
/* Atomic min on float, built from atomicCAS on the 32-bit bit pattern; returns the previous value. */
__device__ static float atomicMin(float *address, float val)
{
  int *p = (int *)(address);
  int  expected, seen = *p;
  do {
    expected = seen;
    seen     = atomicCAS(p, expected, __float_as_int(PetscMin(val, __int_as_float(expected))));
  } while (seen != expected);
  return __int_as_float(seen);
}

/* Atomic max on float, built from atomicCAS on the 32-bit bit pattern; returns the previous value. */
__device__ static float atomicMax(float *address, float val)
{
  int *p = (int *)(address);
  int  expected, seen = *p;
  do {
    expected = seen;
    seen     = atomicCAS(p, expected, __float_as_int(PetscMax(val, __int_as_float(expected))));
  } while (seen != expected);
  return __int_as_float(seen);
}
#endif
#endif
690
#if PETSC_PKG_HIP_VERSION_LT(5, 7, 0)
/* Atomic min for signed 64-bit integers, emulated with atomicCAS; returns the previous value. */
__device__ static inline llint atomicMin(llint *address, llint val)
{
  ullint *p = (ullint *)(address);
  ullint  expected, seen = *p;
  do {
    expected = seen;
    seen     = atomicCAS(p, expected, (ullint)(PetscMin(val, (llint)expected)));
  } while (seen != expected);
  return (llint)seen;
}

/* Atomic max for signed 64-bit integers, emulated with atomicCAS; returns the previous value. */
__device__ static inline llint atomicMax(llint *address, llint val)
{
  ullint *p = (ullint *)(address);
  ullint  expected, seen = *p;
  do {
    expected = seen;
    seen     = atomicCAS(p, expected, (ullint)(PetscMax(val, (llint)expected)));
  } while (seen != expected);
  return (llint)seen;
}
#endif
714
/* Functor performing x = min(x, y) atomically, returning the previous value of x. */
template <typename Type>
struct AtomicMin {
  __device__ Type operator()(Type &x, Type y) const
  {
    return atomicMin(&x, y);
  }
};
/* Functor performing x = max(x, y) atomically, returning the previous value of x. */
template <typename Type>
struct AtomicMax {
  __device__ Type operator()(Type &x, Type y) const
  {
    return atomicMax(&x, y);
  }
};
723
724 /*
725 Atomic bitwise operations
726 As of ROCm 3.10, the llint atomicAnd/Or/Xor(llint*, llint) is not supported
727 */
728
/* Atomic bitwise AND for signed 64-bit integers via atomicCAS; returns the previous value. */
__device__ static inline llint atomicAnd(llint *address, llint val)
{
  ullint *p = (ullint *)(address);
  ullint  expected, seen = *p;
  do {
    expected = seen;
    seen     = atomicCAS(p, expected, (ullint)(val & (llint)expected));
  } while (seen != expected);
  return (llint)seen;
}
/* Atomic bitwise OR for signed 64-bit integers via atomicCAS; returns the previous value. */
__device__ static inline llint atomicOr(llint *address, llint val)
{
  ullint *p = (ullint *)(address);
  ullint  expected, seen = *p;
  do {
    expected = seen;
    seen     = atomicCAS(p, expected, (ullint)(val | (llint)expected));
  } while (seen != expected);
  return (llint)seen;
}

/* Atomic bitwise XOR for signed 64-bit integers via atomicCAS; returns the previous value. */
__device__ static inline llint atomicXor(llint *address, llint val)
{
  ullint *p = (ullint *)(address);
  ullint  expected, seen = *p;
  do {
    expected = seen;
    seen     = atomicCAS(p, expected, (ullint)(val ^ (llint)expected));
  } while (seen != expected);
  return (llint)seen;
}
760
/* Functor performing x &= y atomically, returning the previous value of x. */
template <typename Type>
struct AtomicBAND {
  __device__ Type operator()(Type &x, Type y) const
  {
    return atomicAnd(&x, y);
  }
};
/* Functor performing x |= y atomically, returning the previous value of x. */
template <typename Type>
struct AtomicBOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    return atomicOr(&x, y);
  }
};
/* Functor performing x ^= y atomically, returning the previous value of x. */
template <typename Type>
struct AtomicBXOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    return atomicXor(&x, y);
  }
};
773
774 /*
775 Atomic logical operations:
776
777 CUDA has no atomic logical operations at all. We support them on integer types.
778 */
779
780 /* A template without definition makes any instantiation not using given specializations erroneous at compile time,
781 which is what we want since we only support 32-bit and 64-bit integers.
782 */
/* Primary template deliberately has no definition: instantiating AtomicLogical for a
   width other than 4 or 8 bytes fails at compile time, restricting use to 32-bit and
   64-bit integers. */
template <typename Type, class Op, int size /* sizeof(Type) */>
struct AtomicLogical;

/* 32-bit specialization: atomically set x = Op(x, y) via a CAS loop on the int word
   holding x; returns the previous value of x. */
template <typename Type, class Op>
struct AtomicLogical<Type, Op, 4> {
  __device__ Type operator()(Type &x, Type y) const
  {
    Op   combine;
    int *p = (int *)(&x);
    int  expected, seen = *p;
    do {
      expected = seen;
      seen     = atomicCAS(p, expected, (int)(combine((Type)expected, y)));
    } while (seen != expected);
    return (Type)seen;
  }
};

/* 64-bit specialization: same CAS-loop scheme on the unsigned long long word holding x. */
template <typename Type, class Op>
struct AtomicLogical<Type, Op, 8> {
  __device__ Type operator()(Type &x, Type y) const
  {
    Op      combine;
    ullint *p = (ullint *)(&x);
    ullint  expected, seen = *p;
    do {
      expected = seen;
      seen     = atomicCAS(p, expected, (ullint)(combine((Type)expected, y)));
    } while (seen != expected);
    return (Type)seen;
  }
};
815
816 /* Note land/lor/lxor below are different from LAND etc above. Here we pass arguments by value and return result of ops (not old value) */
/* Logical AND on values: yields x && y (0 or 1), not the old value. */
template <typename Type>
struct land {
  __device__ Type operator()(Type x, Type y)
  {
    return x && y;
  }
};
/* Logical OR on values: yields x || y (0 or 1). */
template <typename Type>
struct lor {
  __device__ Type operator()(Type x, Type y)
  {
    return x || y;
  }
};
/* Logical XOR on values: true iff exactly one of x, y is nonzero. */
template <typename Type>
struct lxor {
  __device__ Type operator()(Type x, Type y)
  {
    return (!x) != (!y);
  }
};
829
/* Atomic logical AND: x = (x && y) atomically, returning the previous value of x. */
template <typename Type>
struct AtomicLAND {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, land<Type>, sizeof(Type)> f;
    return f(x, y);
  }
};
/* Atomic logical OR: x = (x || y) atomically, returning the previous value of x. */
template <typename Type>
struct AtomicLOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, lor<Type>, sizeof(Type)> f;
    return f(x, y);
  }
};
/* Atomic logical XOR: x = (!x != !y) atomically, returning the previous value of x. */
template <typename Type>
struct AtomicLXOR {
  __device__ Type operator()(Type &x, Type y) const
  {
    AtomicLogical<Type, lxor<Type>, sizeof(Type)> f;
    return f(x, y);
  }
};
854 #endif
855