// xref: /libCEED/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h (revision 9bc663991d6482bcb1d60b1f116148f11db83fa1)
// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED:  http://github.com/ceed

/// @file
/// Internal header for HIP non-tensor product basis templates
#include <ceed/types.h>
11 
//------------------------------------------------------------------------------
// Tensor contraction
//
// Dense non-tensor contraction for one element: applies the basis matrix d_B
// (indexed as d_B[node + quad * P + d * P * Q]) to the element's nodal values.
// Launched with Q threads in x; thread t writes all Q_COMP outputs at index t.
// Assumes blockDim.x == Q — TODO confirm against the calling kernel.
//------------------------------------------------------------------------------
template <int NUM_COMP, int Q_COMP, int P, int Q>
inline __device__ void Contract(const CeedInt elem, const CeedInt strides_elem_U, const CeedInt strides_elem_V, const CeedInt strides_comp_U,
                                const CeedInt strides_comp_V, const CeedInt strides_q_comp_V, const CeedScalar *__restrict__ d_B,
                                const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  const CeedInt tid = threadIdx.x;
  // TODO load B in shared memory if blockDim.z > 1?

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Run with Q threads
    // Base of this element/component's input values
    const CeedScalar *elem_U  = d_U + elem * strides_elem_U + comp * strides_comp_U;
    // Per-thread accumulator, one slot per quadrature component (zero-initialized)
    CeedScalar        acc[Q_COMP] = {0.0};

    for (CeedInt node = 0; node < P; node++) {
      const CeedScalar u = elem_U[node];

      for (CeedInt d = 0; d < Q_COMP; d++) acc[d] += d_B[node + tid * P + d * P * Q] * u;
    }
    // Overwrite (not accumulate) the output at this thread's quadrature index
    CeedScalar *elem_V = d_V + elem * strides_elem_V + comp * strides_comp_V + tid;

    for (CeedInt d = 0; d < Q_COMP; d++) elem_V[d * strides_q_comp_V] = acc[d];
  }
}
38 
//------------------------------------------------------------------------------
// Tensor contraction transpose
//
// Transpose of Contract: applies d_B^T (reading d_B[node + quad * P + d * P * Q])
// to quadrature-point data and ACCUMULATES into the nodal output d_V via `+=`.
// Launched with P threads in x; thread t handles node index t.
// Assumes blockDim.x == P — TODO confirm against the calling kernel.
//------------------------------------------------------------------------------
template <int NUM_COMP, int Q_COMP, int P, int Q>
inline __device__ void ContractTranspose(const CeedInt elem, const CeedInt strides_elem_U, const CeedInt strides_elem_V, const CeedInt strides_comp_U,
                                         const CeedInt strides_comp_V, const CeedInt strides_q_comp_U, const CeedScalar *__restrict__ d_B,
                                         const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  const CeedInt tid = threadIdx.x;
  // TODO load B in shared memory if blockDim.z > 1?

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Run with P threads
    CeedScalar acc = 0.0;

    // Sum contributions from every quadrature component and quadrature point
    for (CeedInt d = 0; d < Q_COMP; d++) {
      const CeedScalar *q_U = d_U + elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U;

      for (CeedInt quad = 0; quad < Q; quad++) acc += d_B[tid + quad * P + d * P * Q] * q_U[quad];
    }
    // Accumulate into the output: transpose application adds into existing values
    d_V[elem * strides_elem_V + comp * strides_comp_V + tid] += acc;
  }
}
61