xref: /libCEED/include/ceed/jit-source/cuda/cuda-gen-templates.h (revision bce4db6f154acd10f9bd9018530c044d45753d01)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA backend macro and type definitions for JiT source
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // Load matrices for basis actions
14 //------------------------------------------------------------------------------
template <int P, int Q>
inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  // Cooperative copy of a P x Q basis matrix from global to (shared) storage:
  // every thread in the block strides through the P*Q entries starting at its flat id
  const CeedInt block_size = blockDim.x * blockDim.y * blockDim.z;

  for (CeedInt k = data.t_id; k < P * Q; k += block_size) {
    B[k] = d_B[k];
  }
}
19 
20 //------------------------------------------------------------------------------
21 // AtPoints
22 //------------------------------------------------------------------------------
23 
24 //------------------------------------------------------------------------------
25 // L-vector -> single point
26 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather all components of point p of element elem into registers.
  // NOTE(review): no `p < points_in_elem` guard here, unlike WritePoint —
  // presumably indices are padded past points_in_elem so the read is benign; confirm.
  const CeedInt ind = indices[elem * NUM_PTS + p];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    r_u[c] = d_u[ind + c * COMP_STRIDE];
  }
}
36 
37 //------------------------------------------------------------------------------
38 // Single point -> L-vector
39 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
  // Scatter-accumulate all components of point p back to the L-vector.
  // Padding points (p >= points_in_elem) are skipped.
  if (p >= points_in_elem) return;

  const CeedInt ind = indices[elem * NUM_PTS + p];

  // NOTE(review): plain `+=` (no atomics) — assumes each point is owned by
  // exactly one thread so no two threads hit the same L-vector entry; confirm.
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    d_u[ind + c * COMP_STRIDE] += r_u[c];
  }
}
51 
52 //------------------------------------------------------------------------------
53 // 1D
54 //------------------------------------------------------------------------------
55 
56 //------------------------------------------------------------------------------
57 // Set E-vector value
58 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  // Set the single E-vector entry with flat index n = comp * P_1D + node;
  // only the thread owning that node writes its register.
  const CeedInt comp = n / P_1D;
  const CeedInt node = n - comp * P_1D;

  if (data.t_id_x != node) return;
  r_v[comp] = value;
}
68 
69 //------------------------------------------------------------------------------
70 // L-vector -> E-vector, offsets provided
71 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one 1D element's values into registers, one node per thread,
  // using the offset table `indices`
  if (data.t_id_x >= P_1D) return;

  const CeedInt offset = indices[elem * P_1D + data.t_id_x];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    r_u[c] = d_u[offset + c * COMP_STRIDE];
  }
}
82 
83 //------------------------------------------------------------------------------
84 // L-vector -> E-vector, strided
85 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  // Gather one 1D element's values into registers using compile-time strides
  if (data.t_id_x >= P_1D) return;

  const CeedInt base = elem * STRIDES_ELEM + data.t_id_x * STRIDES_NODE;

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    r_u[c] = d_u[base + c * STRIDES_COMP];
  }
}
96 
97 //------------------------------------------------------------------------------
98 // E-vector -> L-vector, offsets provided
99 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Scatter-accumulate one 1D element's registers to the L-vector.
  // atomicAdd: the same L-vector node may be shared by neighboring elements.
  if (data.t_id_x >= P_1D) return;

  const CeedInt offset = indices[elem * P_1D + data.t_id_x];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    atomicAdd(&d_v[offset + c * COMP_STRIDE], r_v[c]);
  }
}
110 
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  // Scatter-accumulate the single E-vector entry n = comp * P_1D + node;
  // only the owning thread participates.
  const CeedInt comp = n / P_1D;
  const CeedInt node = n - comp * P_1D;

  if (data.t_id_x != node) return;

  const CeedInt offset = indices[elem * P_1D + node];

  atomicAdd(&d_v[offset + comp * COMP_STRIDE], r_v[comp]);
}
124 
125 //------------------------------------------------------------------------------
126 // E-vector -> L-vector, strided
127 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Scatter-accumulate one 1D element's registers using compile-time strides.
  // Strided layouts give each element a disjoint range, so plain += suffices.
  if (data.t_id_x >= P_1D) return;

  const CeedInt base = elem * STRIDES_ELEM + data.t_id_x * STRIDES_NODE;

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    d_v[base + c * STRIDES_COMP] += r_v[c];
  }
}
138 
139 //------------------------------------------------------------------------------
140 // 2D
141 //------------------------------------------------------------------------------
142 
143 //------------------------------------------------------------------------------
144 // Set E-vector value
145 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  // Decompose flat index n = comp * P_1D^2 + node_y * P_1D + node_x;
  // only the thread at (node_x, node_y) writes its register.
  const CeedInt comp   = n / (P_1D * P_1D);
  const CeedInt rem    = n - comp * (P_1D * P_1D);
  const CeedInt node_y = rem / P_1D;
  const CeedInt node_x = rem - node_y * P_1D;

  if (data.t_id_x != node_x || data.t_id_y != node_y) return;
  r_v[comp] = value;
}
156 
157 //------------------------------------------------------------------------------
158 // L-vector -> E-vector, offsets provided
159 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one 2D element's values into registers, one tensor node per thread,
  // using the offset table `indices`
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt node   = data.t_id_y * P_1D + data.t_id_x;
  const CeedInt offset = indices[elem * P_1D * P_1D + node];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    r_u[c] = d_u[offset + c * COMP_STRIDE];
  }
}
170 
171 //------------------------------------------------------------------------------
172 // L-vector -> E-vector, strided
173 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  // Gather one 2D element's values into registers using compile-time strides
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt node = data.t_id_y * P_1D + data.t_id_x;
  const CeedInt base = elem * STRIDES_ELEM + node * STRIDES_NODE;

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    r_u[c] = d_u[base + c * STRIDES_COMP];
  }
}
184 
185 //------------------------------------------------------------------------------
186 // E-vector -> L-vector, offsets provided
187 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Scatter-accumulate one 2D element's registers to the L-vector.
  // atomicAdd: the same L-vector node may be shared by neighboring elements.
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt node   = data.t_id_y * P_1D + data.t_id_x;
  const CeedInt offset = indices[elem * P_1D * P_1D + node];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    atomicAdd(&d_v[offset + c * COMP_STRIDE], r_v[c]);
  }
}
198 
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  // Scatter-accumulate the single E-vector entry n = comp * P_1D^2 + node;
  // only the thread at the target (x, y) node participates.
  const CeedInt comp   = n / (P_1D * P_1D);
  const CeedInt rem    = n - comp * (P_1D * P_1D);  // node index within the element
  const CeedInt node_y = rem / P_1D;
  const CeedInt node_x = rem - node_y * P_1D;

  if (data.t_id_x != node_x || data.t_id_y != node_y) return;

  const CeedInt offset = indices[elem * P_1D * P_1D + rem];

  atomicAdd(&d_v[offset + comp * COMP_STRIDE], r_v[comp]);
}
214 
215 //------------------------------------------------------------------------------
216 // E-vector -> L-vector, strided
217 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Scatter-accumulate one 2D element's registers using compile-time strides.
  // Strided layouts give each element a disjoint range, so plain += suffices.
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt node = data.t_id_y * P_1D + data.t_id_x;
  const CeedInt base = elem * STRIDES_ELEM + node * STRIDES_NODE;

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    d_v[base + c * STRIDES_COMP] += r_v[c];
  }
}
228 
229 //------------------------------------------------------------------------------
230 // 3D
231 //------------------------------------------------------------------------------
232 
233 //------------------------------------------------------------------------------
234 // Set E-vector value
235 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  // Decompose flat index n = comp * P_1D^3 + node_z * P_1D^2 + node_y * P_1D + node_x.
  // The z dimension lives in registers: the owning (x, y) thread writes slot
  // r_v[node_z + comp * P_1D].
  const CeedInt comp   = n / (P_1D * P_1D * P_1D);
  const CeedInt rem    = n - comp * (P_1D * P_1D * P_1D);
  const CeedInt node_z = rem / (P_1D * P_1D);
  const CeedInt node_y = (rem - node_z * P_1D * P_1D) / P_1D;
  const CeedInt node_x = rem % P_1D;

  if (data.t_id_x != node_x || data.t_id_y != node_y) return;
  r_v[node_z + comp * P_1D] = value;
}
247 
248 //------------------------------------------------------------------------------
249 // L-vector -> E-vector, offsets provided
250 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one 3D element's values into registers: each (x, y) thread owns the
  // whole z column, stored as r_u[z + comp * P_1D]
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt elem_base = elem * P_1D * P_1D * P_1D;
  const CeedInt xy        = data.t_id_y * P_1D + data.t_id_x;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt offset = indices[elem_base + xy + z * P_1D * P_1D];

    for (CeedInt c = 0; c < NUM_COMP; c++) {
      r_u[z + c * P_1D] = d_u[offset + c * COMP_STRIDE];
    }
  }
}
263 
264 //------------------------------------------------------------------------------
265 // L-vector -> E-vector, strided
266 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  // Gather one 3D element's values into registers (z column per thread)
  // using compile-time strides
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt xy = data.t_id_y * P_1D + data.t_id_x;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt base = elem * STRIDES_ELEM + (xy + z * P_1D * P_1D) * STRIDES_NODE;

    for (CeedInt c = 0; c < NUM_COMP; c++) {
      r_u[z + c * P_1D] = d_u[base + c * STRIDES_COMP];
    }
  }
}
279 
280 //------------------------------------------------------------------------------
281 // E-vector -> Q-vector, offsets provided
282 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int Q_1D>
inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
                                               const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
                                               CeedScalar *__restrict__ r_u) {
  // Gather one z = q quadrature slice of a 3D element into registers,
  // using the offset table `indices`
  if (data.t_id_x >= Q_1D || data.t_id_y >= Q_1D) return;

  const CeedInt node   = data.t_id_y * Q_1D + data.t_id_x + q * Q_1D * Q_1D;
  const CeedInt offset = indices[elem * Q_1D * Q_1D * Q_1D + node];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    r_u[c] = d_u[offset + c * COMP_STRIDE];
  }
}
294 
295 //------------------------------------------------------------------------------
296 // E-vector -> Q-vector, strided
297 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
                                              CeedScalar *__restrict__ r_u) {
  // Gather one z = q quadrature slice of a 3D element into registers
  // using compile-time strides
  if (data.t_id_x >= Q_1D || data.t_id_y >= Q_1D) return;

  const CeedInt node = data.t_id_y * Q_1D + data.t_id_x + q * Q_1D * Q_1D;
  const CeedInt base = elem * STRIDES_ELEM + node * STRIDES_NODE;

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    r_u[c] = d_u[base + c * STRIDES_COMP];
  }
}
308 
309 //------------------------------------------------------------------------------
310 // E-vector -> L-vector, offsets provided
311 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Scatter-accumulate one 3D element's registers (z column per thread) to the
  // L-vector. atomicAdd: nodes may be shared between neighboring elements.
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt elem_base = elem * P_1D * P_1D * P_1D;
  const CeedInt xy        = data.t_id_y * P_1D + data.t_id_x;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt offset = indices[elem_base + xy + z * P_1D * P_1D];

    for (CeedInt c = 0; c < NUM_COMP; c++) {
      atomicAdd(&d_v[offset + c * COMP_STRIDE], r_v[z + c * P_1D]);
    }
  }
}
324 
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  // Scatter-accumulate the single E-vector entry
  // n = comp * P_1D^3 + node_z * P_1D^2 + node_y * P_1D + node_x;
  // the owning (x, y) thread holds it in register slot r_v[node_z + comp * P_1D].
  const CeedInt comp   = n / (P_1D * P_1D * P_1D);
  const CeedInt rem    = n - comp * (P_1D * P_1D * P_1D);  // node index within the element
  const CeedInt node_z = rem / (P_1D * P_1D);
  const CeedInt node_y = (rem - node_z * P_1D * P_1D) / P_1D;
  const CeedInt node_x = rem % P_1D;

  if (data.t_id_x != node_x || data.t_id_y != node_y) return;

  const CeedInt offset = indices[elem * P_1D * P_1D * P_1D + rem];

  atomicAdd(&d_v[offset + comp * COMP_STRIDE], r_v[node_z + comp * P_1D]);
}
341 
342 //------------------------------------------------------------------------------
343 // E-vector -> L-vector, strided
344 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Scatter-accumulate one 3D element's registers (z column per thread) using
  // compile-time strides. Strided layouts are element-disjoint, so += suffices.
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt xy = data.t_id_y * P_1D + data.t_id_x;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt base = elem * STRIDES_ELEM + (xy + z * P_1D * P_1D) * STRIDES_NODE;

    for (CeedInt c = 0; c < NUM_COMP; c++) {
      d_v[base + c * STRIDES_COMP] += r_v[z + c * P_1D];
    }
  }
}
357 
358 //------------------------------------------------------------------------------
359 // 3D collocated derivatives computation
360 //------------------------------------------------------------------------------
// Compute collocated derivatives at quadrature slice q of a 3D element.
// r_U holds NUM_COMP z columns in registers (r_U[z + comp * Q_1D]); r_V receives
// the three derivatives at slice q, laid out r_V[comp + dim * NUM_COMP].
// c_G is the Q_1D x Q_1D collocated 1D differentiation matrix; data.slice is
// shared scratch of at least T_1D * T_1D scalars.
// NOTE(review): the __syncthreads() calls sit inside the t_id guard, so this
// assumes the launch uses exactly Q_1D x Q_1D threads per slice (no thread is
// excluded by the guard) — confirm against the generated launch configuration.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                        CeedScalar *__restrict__ r_V) {
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      // Stage this component's slice-q values in shared memory so each thread
      // can read its row/column neighbors; barriers order reuse across comps
      __syncthreads();
      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D];
      __syncthreads();
      // X derivative: contract along the x row of the shared slice
      r_V[comp + 0 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D];
      }
      // Y derivative: contract along the y column of the shared slice
      r_V[comp + 1 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D];
      }
      // Z derivative: the z column already lives in this thread's registers
      r_V[comp + 2 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D];
      }
    }
  }
}
387 
388 //------------------------------------------------------------------------------
389 // 3D collocated derivatives transpose
390 //------------------------------------------------------------------------------
// Apply the transpose of the collocated derivative at quadrature slice q of a
// 3D element. r_U holds the three derivative values per component
// (r_U[comp + dim * NUM_COMP]); results are ACCUMULATED (+=) into the register
// z columns r_V[z + comp * Q_1D], so r_V must be initialized by the caller.
// c_G is the Q_1D x Q_1D collocated 1D differentiation matrix; data.slice is
// shared scratch of at least T_1D * T_1D scalars.
// NOTE(review): the __syncthreads() calls sit inside the t_id guard, so this
// assumes the launch uses exactly Q_1D x Q_1D threads per slice (no thread is
// excluded by the guard) — confirm against the generated launch configuration.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                                 CeedScalar *__restrict__ r_V) {
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      // Stage the x-derivative values in shared memory; barriers order reuse
      __syncthreads();
      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
      __syncthreads();
      // X derivative: transposed contraction (c_G indexed column-wise vs. forward op)
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D];
      }
      // Y derivative: restage with the y-derivative values, then contract
      __syncthreads();
      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D];
      }
      // Z derivative: scatter this slice's contribution along the register z column
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP];
      }
    }
  }
}
417