// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED:  http://github.com/ceed

/// @file
/// Internal header for CUDA backend macro and type definitions for JiT source
#include <ceed/types.h>

//------------------------------------------------------------------------------
// Load matrices for basis actions
//------------------------------------------------------------------------------
// Cooperatively copy the P x Q basis matrix d_B (global memory) into B, with
// every thread of the block striding over the P * Q entries.
template <int P, int Q>
inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  const CeedInt num_threads = blockDim.x * blockDim.y * blockDim.z;

  for (CeedInt entry = data.t_id; entry < P * Q; entry += num_threads) {
    B[entry] = d_B[entry];
  }
}

//------------------------------------------------------------------------------
// AtPoints
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> single point
//------------------------------------------------------------------------------
// Gather the NUM_COMP components of point p of element elem from the L-vector
// d_u into the per-thread registers r_u, via the restriction indices.
// NOTE(review): unlike WritePoint, no p < points_in_elem guard is applied —
// presumably padding entries of indices alias valid locations; confirm.
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  const CeedInt ind = indices[p + elem * NUM_PTS];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * COMP_STRIDE];
}

//------------------------------------------------------------------------------
// Single point -> L-vector
//------------------------------------------------------------------------------
// Scatter-accumulate the NUM_COMP register values r_u for point p of element
// elem into the L-vector d_u; points at or beyond points_in_elem are padding
// and are skipped.
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
  if (p >= points_in_elem) return;

  const CeedInt ind = indices[p + elem * NUM_PTS];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_u[ind + comp * COMP_STRIDE] += r_u[comp];
}

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Each of the first P_1D threads in x gathers its node's NUM_COMP components
// from the L-vector d_u into registers r_u, using the restriction indices.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D) return;

  const CeedInt ind = indices[data.t_id_x + elem * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * COMP_STRIDE];
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Strided (index-free) gather: node/component/element offsets come from the
// compile-time strides instead of a restriction index array.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D) return;

  const CeedInt ind = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Each of the first P_1D threads in x accumulates its node's NUM_COMP register
// values r_v into the L-vector d_v; atomicAdd is used since restriction
// offsets may collide across elements.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D) return;

  const CeedInt ind = indices[data.t_id_x + elem * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + comp * COMP_STRIDE], r_v[comp]);
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Strided accumulation of register values r_v into the L-vector d_v with plain
// += (no atomics) — the strided layout is expected to give each thread a
// distinct destination.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D) return;

  const CeedInt ind = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Threads in the P_1D x P_1D tile gather their node's NUM_COMP components
// from the L-vector d_u into registers r_u via the restriction indices.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt ind  = indices[node + elem * P_1D * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * COMP_STRIDE];
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Strided (index-free) gather for the P_1D x P_1D tile: offsets are computed
// from the compile-time strides instead of a restriction index array.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Threads in the P_1D x P_1D tile accumulate their node's NUM_COMP register
// values r_v into the L-vector d_v; atomicAdd is used since restriction
// offsets may collide across elements.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt ind  = indices[node + elem * P_1D * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + comp * COMP_STRIDE], r_v[comp]);
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Strided accumulation of register values r_v into the L-vector d_v with plain
// += (no atomics) — the strided layout is expected to give each thread a
// distinct destination.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Threads in the P_1D x P_1D tile loop over the z levels, gathering each
// node's NUM_COMP components from the L-vector d_u into registers r_u
// (register layout: r_u[z + comp * P_1D]).
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
    const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + comp * COMP_STRIDE];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Strided (index-free) 3D gather into registers r_u (layout
// r_u[z + comp * P_1D]); offsets come from the compile-time strides.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
    const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
//------------------------------------------------------------------------------
// Threads in the Q_1D x Q_1D tile gather the NUM_COMP components of their
// quadrature point on slice q from d_u into registers r_u, via indices.
template <int NUM_COMP, int COMP_STRIDE, int Q_1D>
inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
                                               const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
                                               CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= Q_1D || data.t_id_y >= Q_1D) return;

  const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
  const CeedInt ind  = indices[node + elem * Q_1D * Q_1D * Q_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * COMP_STRIDE];
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, strided
//------------------------------------------------------------------------------
// Strided (index-free) gather of the NUM_COMP components on slice q into
// registers r_u; offsets come from the compile-time strides.
template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
                                              CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= Q_1D || data.t_id_y >= Q_1D) return;

  const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
  const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Threads in the P_1D x P_1D tile loop over the z levels, accumulating each
// node's NUM_COMP register values (layout r_v[z + comp * P_1D]) into the
// L-vector d_v; atomicAdd is used since restriction offsets may collide
// across elements.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
    const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + comp * COMP_STRIDE], r_v[z + comp * P_1D]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Strided 3D accumulation of register values (layout r_v[z + comp * P_1D])
// into the L-vector d_v with plain += (no atomics) — the strided layout is
// expected to give each thread a distinct destination.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
    const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1D];
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives computation
//------------------------------------------------------------------------------
// Compute the X/Y/Z derivatives of r_U on quadrature slice q with the
// collocated 1D gradient matrix c_G; results are stored to
// r_V[comp + dim * NUM_COMP]. data.slice (T_1D x T_1D scratch) exchanges the
// current slice between threads of the tile.
// NOTE(review): __syncthreads() is issued inside the thread-index guard; this
// assumes the calling kernels keep the whole block convergent here — confirm.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                        CeedScalar *__restrict__ r_V) {
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    const CeedInt t_x = data.t_id_x;
    const CeedInt t_y = data.t_id_y;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      // Stage this component's slice q so neighboring values are visible
      data.slice[t_x + t_y * T_1D] = r_U[q + comp * Q_1D];
      __syncthreads();
      CeedScalar du_x = 0.0, du_y = 0.0, du_z = 0.0;
      for (CeedInt n = 0; n < Q_1D; n++) {
        du_x += c_G[n + t_x * Q_1D] * data.slice[n + t_y * T_1D];  // X derivative
        du_y += c_G[n + t_y * Q_1D] * data.slice[t_x + n * T_1D];  // Y derivative
        du_z += c_G[n + q * Q_1D] * r_U[n + comp * Q_1D];          // Z derivative (registers only)
      }
      r_V[comp + 0 * NUM_COMP] = du_x;
      r_V[comp + 1 * NUM_COMP] = du_y;
      r_V[comp + 2 * NUM_COMP] = du_z;
      __syncthreads();  // protect slice before the next component overwrites it
    }
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
//------------------------------------------------------------------------------
// Transpose action of GradColloSlice3d: accumulate the X/Y/Z derivative
// components r_U[comp + dim * NUM_COMP] into r_V (layout r_V[i + comp * Q_1D])
// using the transposed collocated gradient matrix c_G. data.slice
// (T_1D x T_1D scratch) exchanges components between threads of the tile.
// NOTE(review): __syncthreads() is issued inside the thread-index guard; this
// assumes the calling kernels keep the whole block convergent here — confirm.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                                 CeedScalar *__restrict__ r_V) {
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    const CeedInt t_x = data.t_id_x;
    const CeedInt t_y = data.t_id_y;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      // X derivative: stage the X component, then apply G^T along x
      data.slice[t_x + t_y * T_1D] = r_U[comp + 0 * NUM_COMP];
      __syncthreads();
      for (CeedInt n = 0; n < Q_1D; n++) r_V[q + comp * Q_1D] += c_G[t_x + n * Q_1D] * data.slice[n + t_y * T_1D];
      __syncthreads();
      // Y derivative: stage the Y component, then apply G^T along y
      data.slice[t_x + t_y * T_1D] = r_U[comp + 1 * NUM_COMP];
      __syncthreads();
      for (CeedInt n = 0; n < Q_1D; n++) r_V[q + comp * Q_1D] += c_G[t_y + n * Q_1D] * data.slice[t_x + n * T_1D];
      __syncthreads();
      // Z derivative: register-only, spreads the Z component across all slices
      for (CeedInt n = 0; n < Q_1D; n++) r_V[n + comp * Q_1D] += c_G[n + q * Q_1D] * r_U[comp + 2 * NUM_COMP];
    }
  }
}
