xref: /libCEED/include/ceed/jit-source/cuda/cuda-gen-templates.h (revision 5ebd836c59d60a2e5e1cb67f6731404c7da26f85)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA backend macro and type definitions for JiT source
10 
11 #include <ceed.h>
12 
13 //------------------------------------------------------------------------------
14 // Load matrices for basis actions
15 //------------------------------------------------------------------------------
template <int P, int Q>
inline __device__ void loadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  // Cooperatively copy the P x Q basis matrix d_B into the buffer B,
  // with every thread in the block striding over the flattened entries.
  const CeedInt num_threads = blockDim.x * blockDim.y * blockDim.z;

  for (CeedInt k = data.t_id; k < P * Q; k += num_threads) B[k] = d_B[k];
}
20 
21 //------------------------------------------------------------------------------
22 // 1D
23 //------------------------------------------------------------------------------
24 
25 //------------------------------------------------------------------------------
26 // L-vector -> E-vector, offsets provided
27 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void readDofsOffset1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one node value per component from the L-vector d_u into registers
  // r_u, using the restriction indices for this element (num_nodes is unused).
  if (data.t_id_x >= P_1d) return;  // threads beyond the 1D basis size idle

  const CeedScalar *src = d_u + indices[data.t_id_x + elem * P_1d];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = src[comp * COMP_STRIDE];
}
38 
39 //------------------------------------------------------------------------------
40 // L-vector -> E-vector, strided
41 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  // Gather one node value per component from a strided L-vector layout into
  // registers r_u.
  if (data.t_id_x >= P_1d) return;  // threads beyond the 1D basis size idle

  const CeedScalar *src = d_u + data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = src[comp * STRIDES_COMP];
}
52 
53 //------------------------------------------------------------------------------
54 // E-vector -> L-vector, offsets provided
55 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void writeDofsOffset1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Scatter-add register values r_v into the L-vector d_v via the restriction
  // indices; atomics are used because elements may share L-vector nodes
  // (num_nodes is unused).
  if (data.t_id_x >= P_1d) return;  // threads beyond the 1D basis size idle

  CeedScalar *dst = d_v + indices[data.t_id_x + elem * P_1d];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&dst[comp * COMP_STRIDE], r_v[comp]);
}
66 
67 //------------------------------------------------------------------------------
68 // E-vector -> L-vector, strided
69 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Sum register values r_v into a strided L-vector layout; plain += suffices
  // here since the strided layout gives each thread a distinct destination.
  if (data.t_id_x >= P_1d) return;  // threads beyond the 1D basis size idle

  CeedScalar *dst = d_v + data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) dst[comp * STRIDES_COMP] += r_v[comp];
}
80 
81 //------------------------------------------------------------------------------
82 // 2D
83 //------------------------------------------------------------------------------
84 
85 //------------------------------------------------------------------------------
86 // L-vector -> E-vector, offsets provided
87 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void readDofsOffset2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one 2D node value per component from the L-vector d_u into
  // registers r_u via the restriction indices (num_nodes is unused).
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;  // outside the P_1d x P_1d tile

  const CeedInt    node = data.t_id_x + data.t_id_y * P_1d;
  const CeedScalar *src = d_u + indices[node + elem * P_1d * P_1d];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = src[comp * COMP_STRIDE];
}
98 
99 //------------------------------------------------------------------------------
100 // L-vector -> E-vector, strided
101 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  // Gather one 2D node value per component from a strided L-vector layout into
  // registers r_u.
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;  // outside the P_1d x P_1d tile

  const CeedInt    node = data.t_id_x + data.t_id_y * P_1d;
  const CeedScalar *src = d_u + node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = src[comp * STRIDES_COMP];
}
112 
113 //------------------------------------------------------------------------------
114 // E-vector -> L-vector, offsets provided
115 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void writeDofsOffset2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Scatter-add register values r_v into the L-vector d_v via the restriction
  // indices; atomics are used because elements may share L-vector nodes
  // (num_nodes is unused).
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;  // outside the P_1d x P_1d tile

  const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
  CeedScalar   *dst  = d_v + indices[node + elem * P_1d * P_1d];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&dst[comp * COMP_STRIDE], r_v[comp]);
}
126 
127 //------------------------------------------------------------------------------
128 // E-vector -> L-vector, strided
129 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Sum register values r_v into a strided L-vector layout; plain += suffices
  // here since the strided layout gives each thread a distinct destination.
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;  // outside the P_1d x P_1d tile

  const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
  CeedScalar   *dst  = d_v + node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) dst[comp * STRIDES_COMP] += r_v[comp];
}
140 
141 //------------------------------------------------------------------------------
142 // 3D
143 //------------------------------------------------------------------------------
144 
145 //------------------------------------------------------------------------------
146 // L-vector -> E-vector, offsets provided
147 //------------------------------------------------------------------------------
148 // TODO: remove "Dofs" and "Quads" in the following function names?
149 //   - readDofsOffset3d -> readOffset3d ?
150 //   - readDofsStrided3d -> readStrided3d ?
151 //   - readSliceQuadsOffset3d -> readSliceOffset3d ?
152 //   - readSliceQuadsStrided3d -> readSliceStrided3d ?
153 //   - writeDofsOffset3d -> writeOffset3d ?
154 //   - writeDofsStrided3d -> writeStrided3d ?
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void readDofsOffset3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather a full z-column of node values per component from the L-vector d_u
  // into registers r_u via the restriction indices (num_nodes is unused).
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;  // outside the P_1d x P_1d tile

  const CeedInt xy_node = data.t_id_x + data.t_id_y * P_1d;

  for (CeedInt z = 0; z < P_1d; z++) {
    const CeedInt ind = indices[xy_node + z * P_1d * P_1d + elem * P_1d * P_1d * P_1d];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + COMP_STRIDE * comp];
  }
}
166 
167 //------------------------------------------------------------------------------
168 // L-vector -> E-vector, strided
169 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  // Gather a full z-column of node values per component from a strided
  // L-vector layout into registers r_u.
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;  // outside the P_1d x P_1d tile

  const CeedInt xy_node = data.t_id_x + data.t_id_y * P_1d;

  for (CeedInt z = 0; z < P_1d; z++) {
    const CeedInt ind = (xy_node + z * P_1d * P_1d) * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + comp * STRIDES_COMP];
  }
}
181 
182 //------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
184 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int Q_1d>
inline __device__ void readSliceQuadsOffset3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
                                              const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one quadrature-point value per component from the z-slice q of the
  // E-vector d_u into registers r_u via indices (nquads is unused).
  if (data.t_id_x >= Q_1d || data.t_id_y >= Q_1d) return;  // outside the Q_1d x Q_1d tile

  const CeedInt    node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
  const CeedScalar *src = d_u + indices[node + elem * Q_1d * Q_1d * Q_1d];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = src[comp * COMP_STRIDE];
}
195 
196 //------------------------------------------------------------------------------
197 // E-vector -> Q-vector, strided
198 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
                                               CeedScalar *__restrict__ r_u) {
  // Gather one quadrature-point value per component from the z-slice q of a
  // strided E-vector layout into registers r_u.
  if (data.t_id_x >= Q_1d || data.t_id_y >= Q_1d) return;  // outside the Q_1d x Q_1d tile

  const CeedInt    node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
  const CeedScalar *src = d_u + node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = src[comp * STRIDES_COMP];
}
209 
210 //------------------------------------------------------------------------------
211 // E-vector -> L-vector, offsets provided
212 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void writeDofsOffset3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Scatter-add a full z-column of register values r_v into the L-vector d_v
  // via the restriction indices; atomics are used because elements may share
  // L-vector nodes (num_nodes is unused).
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;  // outside the P_1d x P_1d tile

  const CeedInt xy_node = data.t_id_x + data.t_id_y * P_1d;

  for (CeedInt z = 0; z < P_1d; z++) {
    const CeedInt ind = indices[xy_node + z * P_1d * P_1d + elem * P_1d * P_1d * P_1d];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1d]);
  }
}
224 
225 //------------------------------------------------------------------------------
226 // E-vector -> L-vector, strided
227 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Sum a full z-column of register values r_v into a strided L-vector layout;
  // plain += suffices since the strided layout gives distinct destinations.
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;  // outside the P_1d x P_1d tile

  const CeedInt xy_node = data.t_id_x + data.t_id_y * P_1d;

  for (CeedInt z = 0; z < P_1d; z++) {
    const CeedInt ind = (xy_node + z * P_1d * P_1d) * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1d];
  }
}
239 
240 //------------------------------------------------------------------------------
241 // 3D collocated derivatives computation
242 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1d>
inline __device__ void gradCollo3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                   CeedScalar *__restrict__ r_V) {
  // Compute the collocated gradient at z-slice q: contract the 1D gradient
  // matrix c_G against the x/y directions through shared memory (data.slice)
  // and against the z direction through the register column r_U.
  // Fix: the original executed __syncthreads() inside the divergent
  // (t_id < Q_1d) guard; threads outside the Q_1d x Q_1d tile skipped the
  // barrier, which is undefined behavior when blockDim exceeds Q_1d.
  // Barriers are now block-uniform (NUM_COMP is a compile-time constant, so
  // every thread runs the same loop trip count).
  const bool is_active = data.t_id_x < Q_1d && data.t_id_y < Q_1d;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1d];
    __syncthreads();  // slice fully written before any thread reads it
    if (is_active) {
      // X derivative
      r_V[comp + 0 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
      // Y derivative
      r_V[comp + 1 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
      // Z derivative (register column only, no shared memory involved)
      r_V[comp + 2 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1d; i++) r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d];  // Contract z direction (Z derivative)
    }
    __syncthreads();  // slice reads done before the next component overwrites it
  }
}
265 
266 //------------------------------------------------------------------------------
267 // 3D collocated derivatives transpose
268 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1d>
inline __device__ void gradColloTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                            CeedScalar *__restrict__ r_V) {
  // Apply the transpose of the collocated gradient at z-slice q, accumulating
  // into r_V: x/y contractions go through shared memory (data.slice), the z
  // contraction is a partial register-only update.
  // Fix: the original executed __syncthreads() inside the divergent
  // (t_id < Q_1d) guard; threads outside the Q_1d x Q_1d tile skipped the
  // barrier, which is undefined behavior when blockDim exceeds Q_1d.
  // Barriers are now block-uniform (NUM_COMP is a compile-time constant, so
  // every thread runs the same loop trip count).
  const bool is_active = data.t_id_x < Q_1d && data.t_id_y < Q_1d;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // X derivative
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
    __syncthreads();  // slice fully written before any thread reads it
    if (is_active) {
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
    }
    __syncthreads();  // X reads done before slice is overwritten with Y data
    // Y derivative
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
    __syncthreads();  // slice fully written before any thread reads it
    if (is_active) {
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
    }
    __syncthreads();  // Y reads done before the next component writes slice
    // Z derivative (registers only, so no barrier needed around it)
    if (is_active) {
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP];  // PARTIAL contract z direction (Z derivative)
    }
  }
}
292