xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h (revision 5f954c19a7f1ef8f2e1bd141b22369f85709825b)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for HIP tensor product basis with AtPoints evaluation
10 #include <ceed/types.h>
11 
12 #include "hip-shared-basis-read-write-templates.h"
13 #include "hip-shared-basis-tensor-at-points-templates.h"
14 #include "hip-shared-basis-tensor-templates.h"
15 
16 //------------------------------------------------------------------------------
17 // Tensor Basis Kernels AtPoints
18 //------------------------------------------------------------------------------
19 
20 //------------------------------------------------------------------------------
21 // Interp
22 //------------------------------------------------------------------------------
23 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
24     void InterpAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
25                         const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
26   extern __shared__ CeedScalar slice[];
27 
28   // load chebyshev_interp_1d into shared memory
29   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
30   loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
31   __syncthreads();
32 
33   SharedData_Hip data;
34   data.t_id_x = threadIdx.x;
35   data.t_id_y = threadIdx.y;
36   data.t_id_z = threadIdx.z;
37   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
38   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
39 
40   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
41   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
42   CeedScalar r_V[BASIS_NUM_COMP];
43 
44   // Apply basis element by element
45   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
46     // Map to coefficients
47     if (BASIS_DIM == 1) {
48       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
49       Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
50     } else if (BASIS_DIM == 2) {
51       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
52       InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
53     } else if (BASIS_DIM == 3) {
54       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
55                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
56       InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
57     }
58 
59     // Map to points
60     InterpAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V,
61                                                                          &d_V[elem * BASIS_NUM_PTS]);
62   }
63 }
64 
65 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
66     void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
67                                  const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
68   extern __shared__ CeedScalar slice[];
69 
70   // load chebyshev_interp_1d into shared memory
71   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
72   loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
73   __syncthreads();
74 
75   SharedData_Hip data;
76   data.t_id_x = threadIdx.x;
77   data.t_id_y = threadIdx.y;
78   data.t_id_z = threadIdx.z;
79   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
80   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
81 
82   CeedScalar r_U[BASIS_NUM_COMP];
83   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
84   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
85 
86   // Apply basis element by element
87   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
88     // Map from points
89     InterpTransposeAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem],
90                                                                                   &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C);
91 
92     // Map from coefficients
93     if (BASIS_DIM == 1) {
94       InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
95       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
96     } else if (BASIS_DIM == 2) {
97       InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
98       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
99     } else if (BASIS_DIM == 3) {
100       InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
101       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
102                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
103     }
104   }
105 }
106 
107 //------------------------------------------------------------------------------
108 // Grad
109 //------------------------------------------------------------------------------
110 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
111     void GradAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
112                       const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
113   extern __shared__ CeedScalar slice[];
114 
115   // load chebyshev_interp_1d into shared memory
116   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
117   loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
118   __syncthreads();
119 
120   SharedData_Hip data;
121   data.t_id_x = threadIdx.x;
122   data.t_id_y = threadIdx.y;
123   data.t_id_z = threadIdx.z;
124   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
125   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
126 
127   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
128   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
129   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
130 
131   // Apply basis element by element
132   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
133     // Map to coefficients
134     if (BASIS_DIM == 1) {
135       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
136       Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
137     } else if (BASIS_DIM == 2) {
138       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
139       InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
140     } else if (BASIS_DIM == 3) {
141       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
142                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
143       InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
144     }
145 
146     // Map to points
147     GradAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V,
148                                                                        &d_V[elem * BASIS_NUM_PTS]);
149   }
150 }
151 
152 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
153     void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
154                                const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
155   extern __shared__ CeedScalar slice[];
156 
157   // load chebyshev_interp_1d into shared memory
158   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
159   loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
160   __syncthreads();
161 
162   SharedData_Hip data;
163   data.t_id_x = threadIdx.x;
164   data.t_id_y = threadIdx.y;
165   data.t_id_z = threadIdx.z;
166   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
167   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
168 
169   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
170   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
171   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
172 
173   // Apply basis element by element
174   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
175     // Map from points
176     GradTransposeAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem],
177                                                                                 &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C);
178 
179     // Map from coefficients
180     if (BASIS_DIM == 1) {
181       InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
182       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
183     } else if (BASIS_DIM == 2) {
184       InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
185       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
186     } else if (BASIS_DIM == 3) {
187       InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
188       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
189                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
190     }
191   }
192 }
193