xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h (revision 9e1d4b8291fc4f4e19d20cdfdecd866260d0e6d2)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for HIP tensor product basis with AtPoints evaluation
10 #include <ceed/types.h>
11 
12 #include "hip-shared-basis-read-write-templates.h"
13 #include "hip-shared-basis-tensor-at-points-templates.h"
14 #include "hip-shared-basis-tensor-templates.h"
15 
16 //------------------------------------------------------------------------------
17 // Tensor Basis Kernels AtPoints
18 //------------------------------------------------------------------------------
19 
20 //------------------------------------------------------------------------------
21 // Interp
22 //------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Interpolate element coefficients to arbitrary points.
//
// Launch layout: threadIdx.x (and threadIdx.y for BASIS_DIM > 1) cooperate on
// one element; blockDim.z elements are handled per block and the grid strides
// over num_elem.  Dynamic shared memory supplies one T_1D (x T_1D for
// BASIS_DIM > 1) CeedScalar slice per z-slice of the block.
//------------------------------------------------------------------------------
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
    void InterpAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
                        const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Load the 1D Chebyshev interpolation matrix into shared memory once per block
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
  __syncthreads();

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];  // nodal values
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];  // Chebyshev coefficients
  CeedScalar r_V[BASIS_NUM_COMP];                                     // values at one point

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Map nodal values to Chebyshev coefficients
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
    }

    // Map to points
    // Round the trip count up to a multiple of the x-y thread count with exact
    // integer arithmetic (replaces double-precision ceil() and its implicit
    // narrowing conversion); every thread runs the same number of iterations so
    // the AtPoints templates are entered uniformly.
    const CeedInt num_threads = blockDim.x * blockDim.y;
    const CeedInt bound       = ((BASIS_NUM_PTS + num_threads - 1) / num_threads) * num_threads;

    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += num_threads) {
      // Padded lanes (i >= BASIS_NUM_PTS) wrap onto a valid point for the read,
      // participate in the template, but skip the final store below
      const CeedInt p = i % BASIS_NUM_PTS;
      CeedScalar    r_X[BASIS_DIM];

      for (CeedInt d = 0; d < BASIS_DIM; d++) {
        r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
      }
      if (BASIS_DIM == 1) {
        InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      } else if (BASIS_DIM == 2) {
        InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      } else if (BASIS_DIM == 3) {
        InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      }
      for (CeedInt j = 0; j < BASIS_NUM_COMP; j++) {
        if (i < BASIS_NUM_PTS) d_V[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + i] = r_V[j];
      }
    }
  }
}
82 
//------------------------------------------------------------------------------
// Transpose interpolation: accumulate point values back onto element nodes.
//
// Launch layout matches InterpAtPoints; results are summed (not overwritten)
// into d_V via the SumElementStrided* templates.
//------------------------------------------------------------------------------
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
    void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
                                 const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Load the 1D Chebyshev interpolation matrix into shared memory once per block
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
  __syncthreads();

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  CeedScalar r_U[BASIS_NUM_COMP];                                     // values at one point
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];  // accumulated Chebyshev coefficients
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];  // nodal values

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Clear the coefficient accumulator for this element
    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;

    // Map from points
    // Round the trip count up to a multiple of the x-y thread count with exact
    // integer arithmetic (replaces double-precision ceil() and its implicit
    // narrowing conversion); every thread runs the same number of iterations so
    // the AtPoints templates are entered uniformly.
    const CeedInt num_threads = blockDim.x * blockDim.y;
    const CeedInt bound       = ((BASIS_NUM_PTS + num_threads - 1) / num_threads) * num_threads;

    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += num_threads) {
      const CeedInt p = i % BASIS_NUM_PTS;
      CeedScalar    r_X[BASIS_DIM];

      for (CeedInt d = 0; d < BASIS_DIM; d++) {
        r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
      }
      // Lanes past this element's true point count load zeros so padding (and
      // the wrapped index p) contributes nothing to the accumulation
      for (CeedInt j = 0; j < BASIS_NUM_COMP; j++) {
        if (i < points_per_elem[elem]) r_U[j] = d_U[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + p];
        else r_U[j] = 0.0;
      }
      if (BASIS_DIM == 1) {
        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 2) {
        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 3) {
        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      }
    }
    __syncthreads();

    // Map accumulated coefficients back to nodes and sum into the output
    if (BASIS_DIM == 1) {
      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}
147 
148 //------------------------------------------------------------------------------
149 // Grad
150 //------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Evaluate gradients of element coefficients at arbitrary points.
//
// Launch layout matches InterpAtPoints; the output holds BASIS_DIM derivative
// components per field component.
//------------------------------------------------------------------------------
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
    void GradAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
                      const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Load the 1D Chebyshev interpolation matrix into shared memory once per block
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
  __syncthreads();

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];  // nodal values
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];  // Chebyshev coefficients
  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];                         // gradient at one point

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Map nodal values to Chebyshev coefficients
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
    }

    // Map to points
    // Round the trip count up to a multiple of the x-y thread count with exact
    // integer arithmetic (replaces double-precision ceil() and its implicit
    // narrowing conversion); every thread runs the same number of iterations so
    // the AtPoints templates are entered uniformly.
    const CeedInt num_threads = blockDim.x * blockDim.y;
    const CeedInt bound       = ((BASIS_NUM_PTS + num_threads - 1) / num_threads) * num_threads;

    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += num_threads) {
      // Padded lanes (i >= BASIS_NUM_PTS) wrap onto a valid point for the read,
      // participate in the template, but skip the final store below
      const CeedInt p = i % BASIS_NUM_PTS;
      CeedScalar    r_X[BASIS_DIM];

      for (CeedInt d = 0; d < BASIS_DIM; d++) {
        r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
      }
      if (BASIS_DIM == 1) {
        GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      } else if (BASIS_DIM == 2) {
        GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      } else if (BASIS_DIM == 3) {
        GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      }
      for (CeedInt j = 0; j < BASIS_NUM_COMP * BASIS_DIM; j++) {
        if (i < BASIS_NUM_PTS) d_V[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + i] = r_V[j];
      }
    }
  }
}
210 
//------------------------------------------------------------------------------
// Transpose gradient: accumulate point gradient values back onto element nodes.
//
// Launch layout matches GradAtPoints; results are summed (not overwritten)
// into d_V via the SumElementStrided* templates.
//------------------------------------------------------------------------------
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
    void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
                               const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Load the 1D Chebyshev interpolation matrix into shared memory once per block
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
  __syncthreads();

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];                         // gradient at one point
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];  // accumulated Chebyshev coefficients
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];  // nodal values

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Clear the coefficient accumulator for this element
    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;

    // Map from points
    // Round the trip count up to a multiple of the x-y thread count with exact
    // integer arithmetic (replaces double-precision ceil() and its implicit
    // narrowing conversion); every thread runs the same number of iterations so
    // the AtPoints templates are entered uniformly.
    const CeedInt num_threads = blockDim.x * blockDim.y;
    const CeedInt bound       = ((BASIS_NUM_PTS + num_threads - 1) / num_threads) * num_threads;

    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += num_threads) {
      const CeedInt p = i % BASIS_NUM_PTS;
      CeedScalar    r_X[BASIS_DIM];

      for (CeedInt d = 0; d < BASIS_DIM; d++) {
        r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
      }
      // Lanes past this element's true point count load zeros so padding (and
      // the wrapped index p) contributes nothing to the accumulation
      for (CeedInt j = 0; j < BASIS_NUM_COMP * BASIS_DIM; j++) {
        if (i < points_per_elem[elem]) r_U[j] = d_U[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + p];
        else r_U[j] = 0.0;
      }
      if (BASIS_DIM == 1) {
        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 2) {
        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 3) {
        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      }
    }
    __syncthreads();

    // Map accumulated coefficients back to nodes and sum into the output
    if (BASIS_DIM == 1) {
      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}
275