xref: /libCEED/include/ceed/jit-source/cuda/cuda-gen-templates.h (revision bcd92680c8b93d5b8861c3c87761f161cf9f50ef)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA backend macro and type definitions for JiT source
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // Load matrices for basis actions
14 //------------------------------------------------------------------------------
template <int P, int Q>
inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  // Cooperatively copy the P x Q basis matrix: each thread copies entries
  // starting at its flat thread id, striding by the full block size
  const CeedInt block_size = blockDim.x * blockDim.y * blockDim.z;

  for (CeedInt i = data.t_id; i < P * Q; i += block_size) B[i] = d_B[i];
}
19 
20 //------------------------------------------------------------------------------
21 // AtPoints
22 //------------------------------------------------------------------------------
23 
24 //------------------------------------------------------------------------------
25 // L-vector -> single point
26 //------------------------------------------------------------------------------
// Read the NUM_COMP components of point p in element elem from the L-vector
// d_u into the per-thread registers r_u, using the indirection array indices.
// - NUM_COMP:    number of field components
// - COMP_STRIDE: stride between components in d_u
// - NUM_PTS:     max points per element (stride of indices per element)
// NOTE(review): unlike WritePoint below, there is no p < points_in_elem guard;
// presumably indices is padded out to NUM_PTS entries per element so the read
// is in-bounds (and the loaded value simply unused) — confirm against caller.
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Indirection: L-vector location of this point
  const CeedInt ind = indices[p + elem * NUM_PTS];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[ind + comp * COMP_STRIDE];
  }
}
36 
37 //------------------------------------------------------------------------------
38 // Single point -> L-vector
39 //------------------------------------------------------------------------------
// Accumulate the NUM_COMP register values r_u for point p of element elem
// into the L-vector d_u through the indirection array indices.
// Only writes when p < points_in_elem, so padded point slots are skipped.
// NOTE(review): the accumulation uses a plain += rather than atomicAdd (cf.
// WriteLVecStandard*d below); this is race-free only if each (elem, p) pair
// maps to a unique L-vector entry — confirm against the restriction setup.
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
  if (p < points_in_elem) {
    // Indirection: L-vector location of this point
    const CeedInt ind = indices[p + elem * NUM_PTS];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      d_u[ind + comp * COMP_STRIDE] += r_u[comp];
    }
  }
}
51 
52 //------------------------------------------------------------------------------
53 // 1D
54 //------------------------------------------------------------------------------
55 
56 //------------------------------------------------------------------------------
57 // L-vector -> E-vector, offsets provided
58 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // One thread per 1D node; threads past P_1d have nothing to load
  if (data.t_id_x >= P_1d) return;

  // Indirection: L-vector entry for this element's node
  const CeedInt dof = indices[data.t_id_x + P_1d * elem];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[COMP_STRIDE * comp + dof];
}
69 
70 //------------------------------------------------------------------------------
71 // L-vector -> E-vector, strided
72 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  // One thread per 1D node; threads past P_1d have nothing to load
  if (data.t_id_x >= P_1d) return;

  // Strided addressing: no indirection array needed
  const CeedInt base = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[base + STRIDES_COMP * comp];
}
83 
84 //------------------------------------------------------------------------------
85 // E-vector -> L-vector, offsets provided
86 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // One thread per 1D node; threads past P_1d have nothing to write
  if (data.t_id_x >= P_1d) return;

  // Indirection: L-vector entry for this element's node
  const CeedInt dof = indices[data.t_id_x + P_1d * elem];

  // Elements may share L-vector entries, so accumulate atomically
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[COMP_STRIDE * comp + dof], r_v[comp]);
}
97 
98 //------------------------------------------------------------------------------
99 // E-vector -> L-vector, strided
100 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // One thread per 1D node; threads past P_1d have nothing to write
  if (data.t_id_x >= P_1d) return;

  // Strided addressing: each entry is owned by exactly one thread, plain += is safe
  const CeedInt base = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[base + STRIDES_COMP * comp] += r_v[comp];
}
111 
112 //------------------------------------------------------------------------------
113 // 2D
114 //------------------------------------------------------------------------------
115 
116 //------------------------------------------------------------------------------
117 // L-vector -> E-vector, offsets provided
118 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // One thread per (x, y) node of the P_1d x P_1d tensor grid
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * P_1d;
  // Indirection: L-vector entry for this element's node
  const CeedInt dof        = indices[local_node + elem * P_1d * P_1d];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[COMP_STRIDE * comp + dof];
}
129 
130 //------------------------------------------------------------------------------
131 // L-vector -> E-vector, strided
132 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  // One thread per (x, y) node of the P_1d x P_1d tensor grid
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  // Strided addressing: no indirection array needed
  const CeedInt local_node = data.t_id_x + data.t_id_y * P_1d;
  const CeedInt base       = local_node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[base + STRIDES_COMP * comp];
}
143 
144 //------------------------------------------------------------------------------
145 // E-vector -> L-vector, offsets provided
146 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // One thread per (x, y) node of the P_1d x P_1d tensor grid
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * P_1d;
  // Indirection: L-vector entry for this element's node
  const CeedInt dof        = indices[local_node + elem * P_1d * P_1d];

  // Elements may share L-vector entries, so accumulate atomically
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[COMP_STRIDE * comp + dof], r_v[comp]);
}
157 
158 //------------------------------------------------------------------------------
159 // E-vector -> L-vector, strided
160 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // One thread per (x, y) node of the P_1d x P_1d tensor grid
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  // Strided addressing: each entry is owned by exactly one thread, plain += is safe
  const CeedInt local_node = data.t_id_x + data.t_id_y * P_1d;
  const CeedInt base       = local_node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[base + STRIDES_COMP * comp] += r_v[comp];
}
171 
172 //------------------------------------------------------------------------------
173 // 3D
174 //------------------------------------------------------------------------------
175 
176 //------------------------------------------------------------------------------
177 // L-vector -> E-vector, offsets provided
178 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Each thread owns one (x, y) pencil and loops over its z nodes
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt pencil_base = data.t_id_x + data.t_id_y * P_1d;  // node at z = 0
  const CeedInt elem_offset = elem * P_1d * P_1d * P_1d;

  for (CeedInt z = 0; z < P_1d; z++) {
    // Indirection: L-vector entry for this element's node
    const CeedInt dof = indices[elem_offset + pencil_base + z * P_1d * P_1d];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[COMP_STRIDE * comp + dof];
  }
}
190 
191 //------------------------------------------------------------------------------
192 // L-vector -> E-vector, strided
193 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  // Each thread owns one (x, y) pencil and loops over its z nodes
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt pencil_base = data.t_id_x + data.t_id_y * P_1d;  // node at z = 0

  for (CeedInt z = 0; z < P_1d; z++) {
    // Strided addressing: no indirection array needed
    const CeedInt base = (pencil_base + z * P_1d * P_1d) * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[base + STRIDES_COMP * comp];
  }
}
205 
206 //------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
208 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int Q_1d>
inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
                                               const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
                                               CeedScalar *__restrict__ r_u) {
  // One thread per (x, y) quadrature point on the z = q slice
  if (data.t_id_x >= Q_1d || data.t_id_y >= Q_1d) return;

  const CeedInt slice_node = data.t_id_x + (data.t_id_y + q * Q_1d) * Q_1d;
  // Indirection: source entry for this quadrature point
  const CeedInt src        = indices[slice_node + elem * Q_1d * Q_1d * Q_1d];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[COMP_STRIDE * comp + src];
}
220 
221 //------------------------------------------------------------------------------
222 // E-vector -> Q-vector, strided
223 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
                                              CeedScalar *__restrict__ r_u) {
  // One thread per (x, y) quadrature point on the z = q slice
  if (data.t_id_x >= Q_1d || data.t_id_y >= Q_1d) return;

  // Strided addressing: no indirection array needed
  const CeedInt slice_node = data.t_id_x + (data.t_id_y + q * Q_1d) * Q_1d;
  const CeedInt base       = slice_node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[base + STRIDES_COMP * comp];
}
234 
235 //------------------------------------------------------------------------------
236 // E-vector -> L-vector, offsets provided
237 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Each thread owns one (x, y) pencil and loops over its z nodes
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt pencil_base = data.t_id_x + data.t_id_y * P_1d;  // node at z = 0
  const CeedInt elem_offset = elem * P_1d * P_1d * P_1d;

  for (CeedInt z = 0; z < P_1d; z++) {
    // Indirection: L-vector entry for this element's node
    const CeedInt dof = indices[elem_offset + pencil_base + z * P_1d * P_1d];

    // Elements may share L-vector entries, so accumulate atomically
    for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[COMP_STRIDE * comp + dof], r_v[z + comp * P_1d]);
  }
}
249 
250 //------------------------------------------------------------------------------
251 // E-vector -> L-vector, strided
252 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Each thread owns one (x, y) pencil and loops over its z nodes
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt pencil_base = data.t_id_x + data.t_id_y * P_1d;  // node at z = 0

  for (CeedInt z = 0; z < P_1d; z++) {
    // Strided addressing: each entry is owned by exactly one thread, plain += is safe
    const CeedInt base = (pencil_base + z * P_1d * P_1d) * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[base + STRIDES_COMP * comp] += r_v[z + comp * P_1d];
  }
}
264 
265 //------------------------------------------------------------------------------
266 // 3D collocated derivatives computation
267 //------------------------------------------------------------------------------
// Compute the collocated gradient of r_U on the z = q quadrature slice.
// - r_U:  registers holding one z-pencil per component, r_U[z + comp * Q_1d]
// - c_G:  Q_1d x Q_1d 1D collocated derivative matrix
// - r_V:  output, r_V[comp + dim * NUM_COMP] for dim = 0 (X), 1 (Y), 2 (Z)
// X and Y derivatives are contracted through the shared-memory slice
// (stride T_1D); the Z derivative is contracted entirely in registers.
// NOTE(review): the __syncthreads() calls sit inside the t_id guard, so they
// are only safe if the launch configuration keeps all (or a warp-uniform set
// of) threads inside the guard — confirm against the generated kernel's
// block dimensions.
template <int NUM_COMP, int Q_1d>
inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                        CeedScalar *__restrict__ r_V) {
  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      // Stage this component's slice value in shared memory for x/y contractions
      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1d];
      __syncthreads();
      // X derivative
      r_V[comp + 0 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
      // Y derivative
      r_V[comp + 1 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
      // Z derivative
      r_V[comp + 2 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1d; i++) r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d];  // Contract z direction (Z derivative)
      // Barrier before the next component overwrites the shared slice
      __syncthreads();
    }
  }
}
290 
291 //------------------------------------------------------------------------------
292 // 3D collocated derivatives transpose
293 //------------------------------------------------------------------------------
// Apply the transpose of the collocated gradient on the z = q slice,
// accumulating into r_V (callers must have initialized/own r_V's contents).
// - r_U:  input gradient registers, r_U[comp + dim * NUM_COMP] for dim = 0..2
// - c_G:  Q_1d x Q_1d 1D collocated derivative matrix (indexed transposed
//         relative to GradColloSlice3d)
// - r_V:  output pencil registers, r_V[z + comp * Q_1d]; x/y terms go into
//         the q entry, the z term is spread over the whole pencil (partial
//         contraction completed across the caller's loop over q)
// NOTE(review): as in GradColloSlice3d, __syncthreads() is inside the thread
// guard — safe only under the generated kernel's launch configuration.
template <int NUM_COMP, int Q_1d>
inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                                 CeedScalar *__restrict__ r_V) {
  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      // X derivative
      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
      __syncthreads();
      // Y derivative
      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
      __syncthreads();
      // Z derivative: register-only, no shared memory or barrier needed
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP];  // PARTIAL contract z direction (Z derivative)
    }
  }
}
317