xref: /libCEED/include/ceed/jit-source/cuda/cuda-gen-templates.h (revision 58c07c4fa7bdba34c2b29fbdcd58893d48c3fd9e)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA backend macro and type definitions for JiT source
10 #ifndef CEED_CUDA_GEN_TEMPLATES_H
11 #define CEED_CUDA_GEN_TEMPLATES_H
12 
13 #include <ceed/types.h>
14 
15 //------------------------------------------------------------------------------
16 // Load matrices for basis actions
17 //------------------------------------------------------------------------------
template <int P, int Q>
inline __device__ void loadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  // Cooperatively copy the P x Q basis matrix from global memory into B,
  // with all threads of the block striding over the flattened entries.
  const CeedInt num_entries = P * Q;
  const CeedInt num_threads = blockDim.x * blockDim.y * blockDim.z;

  for (CeedInt entry = data.t_id; entry < num_entries; entry += num_threads) {
    B[entry] = d_B[entry];
  }
}
22 
23 //------------------------------------------------------------------------------
24 // 1D
25 //------------------------------------------------------------------------------
26 
27 //------------------------------------------------------------------------------
28 // L-vector -> E-vector, offsets provided
29 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void readDofsOffset1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather NUM_COMP components for this thread's node from the L-vector d_u
  // into registers r_u, using the element restriction indices.
  // Threads with t_id_x >= P_1d own no node and do nothing.
  if (data.t_id_x >= P_1d) return;

  const CeedInt dof = indices[data.t_id_x + elem * P_1d];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[dof + COMP_STRIDE * comp];
  }
}
40 
41 //------------------------------------------------------------------------------
42 // L-vector -> E-vector, strided
43 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  // Gather NUM_COMP components for this thread's node from a strided L-vector
  // layout (node/component/element strides are template parameters) into r_u.
  if (data.t_id_x >= P_1d) return;

  const CeedInt base = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[base + comp * STRIDES_COMP];
  }
}
54 
55 //------------------------------------------------------------------------------
56 // E-vector -> L-vector, offsets provided
57 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void writeDofsOffset1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Scatter-add NUM_COMP register values for this thread's node into the
  // L-vector d_v through the restriction indices. atomicAdd is used because
  // multiple elements may contribute to the same L-vector entry.
  if (data.t_id_x >= P_1d) return;

  const CeedInt dof = indices[data.t_id_x + elem * P_1d];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    atomicAdd(&d_v[dof + COMP_STRIDE * comp], r_v[comp]);
  }
}
68 
69 //------------------------------------------------------------------------------
70 // E-vector -> L-vector, strided
71 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Accumulate NUM_COMP register values for this thread's node into a strided
  // L-vector layout. Plain += (no atomics): strided layouts are assumed
  // non-overlapping across elements, matching the original implementation.
  if (data.t_id_x >= P_1d) return;

  const CeedInt base = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    d_v[base + comp * STRIDES_COMP] += r_v[comp];
  }
}
82 
83 //------------------------------------------------------------------------------
84 // 2D
85 //------------------------------------------------------------------------------
86 
87 //------------------------------------------------------------------------------
88 // L-vector -> E-vector, offsets provided
89 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void readDofsOffset2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather NUM_COMP components for this thread's 2D tensor-product node
  // (t_id_x, t_id_y) from the L-vector d_u into registers r_u, using the
  // element restriction indices.
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * P_1d;
  const CeedInt dof        = indices[local_node + elem * P_1d * P_1d];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[dof + COMP_STRIDE * comp];
  }
}
100 
101 //------------------------------------------------------------------------------
102 // L-vector -> E-vector, strided
103 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  // Gather NUM_COMP components for this thread's 2D tensor-product node from
  // a strided L-vector layout into registers r_u.
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * P_1d;
  const CeedInt base       = local_node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[base + comp * STRIDES_COMP];
  }
}
114 
115 //------------------------------------------------------------------------------
116 // E-vector -> L-vector, offsets provided
117 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void writeDofsOffset2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Scatter-add NUM_COMP register values for this thread's 2D node into the
  // L-vector d_v through the restriction indices; atomicAdd guards against
  // concurrent contributions from neighboring elements.
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * P_1d;
  const CeedInt dof        = indices[local_node + elem * P_1d * P_1d];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    atomicAdd(&d_v[dof + COMP_STRIDE * comp], r_v[comp]);
  }
}
128 
129 //------------------------------------------------------------------------------
130 // E-vector -> L-vector, strided
131 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Accumulate NUM_COMP register values for this thread's 2D node into a
  // strided L-vector layout (plain +=, matching the original non-atomic path).
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * P_1d;
  const CeedInt base       = local_node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    d_v[base + comp * STRIDES_COMP] += r_v[comp];
  }
}
142 
143 //------------------------------------------------------------------------------
144 // 3D
145 //------------------------------------------------------------------------------
146 
147 //------------------------------------------------------------------------------
148 // L-vector -> E-vector, offsets provided
149 //------------------------------------------------------------------------------
150 // TODO: remove "Dofs" and "Quads" in the following function names?
151 //   - readDofsOffset3d -> readOffset3d ?
152 //   - readDofsStrided3d -> readStrided3d ?
153 //   - readSliceQuadsOffset3d -> readSliceOffset3d ?
154 //   - readSliceQuadsStrided3d -> readSliceStrided3d ?
155 //   - writeDofsOffset3d -> writeOffset3d ?
156 //   - writeDofsStrided3d -> writeStrided3d ?
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void readDofsOffset3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather a full z-column of nodes for this (t_id_x, t_id_y) thread from the
  // L-vector d_u into registers r_u, layout r_u[z + comp * P_1d], using the
  // element restriction indices.
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt xy_node   = data.t_id_x + data.t_id_y * P_1d;
  const CeedInt elem_base = elem * P_1d * P_1d * P_1d;

  for (CeedInt z = 0; z < P_1d; z++) {
    const CeedInt dof = indices[elem_base + xy_node + z * P_1d * P_1d];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      r_u[z + comp * P_1d] = d_u[dof + COMP_STRIDE * comp];
    }
  }
}
168 
169 //------------------------------------------------------------------------------
170 // L-vector -> E-vector, strided
171 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  // Gather a full z-column of nodes for this (t_id_x, t_id_y) thread from a
  // strided L-vector layout into registers r_u, layout r_u[z + comp * P_1d].
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt xy_node = data.t_id_x + data.t_id_y * P_1d;

  for (CeedInt z = 0; z < P_1d; z++) {
    const CeedInt base = (xy_node + z * P_1d * P_1d) * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      r_u[z + comp * P_1d] = d_u[base + comp * STRIDES_COMP];
    }
  }
}
183 
184 //------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
186 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int Q_1d>
inline __device__ void readSliceQuadsOffset3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
                                              const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather NUM_COMP components at a single quadrature x-y slice q of the
  // E-vector d_u into registers r_u, using the provided offsets.
  if (data.t_id_x >= Q_1d || data.t_id_y >= Q_1d) return;

  const CeedInt slice_node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
  const CeedInt quad       = indices[slice_node + elem * Q_1d * Q_1d * Q_1d];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[quad + COMP_STRIDE * comp];
  }
}
197 
198 //------------------------------------------------------------------------------
199 // E-vector -> Q-vector, strided
200 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
                                               CeedScalar *__restrict__ r_u) {
  // Gather NUM_COMP components at a single quadrature x-y slice q of a
  // strided E-vector layout into registers r_u.
  if (data.t_id_x >= Q_1d || data.t_id_y >= Q_1d) return;

  const CeedInt slice_node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
  const CeedInt base       = slice_node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[base + comp * STRIDES_COMP];
  }
}
211 
212 //------------------------------------------------------------------------------
213 // E-vector -> L-vector, offsets provided
214 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void writeDofsOffset3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Scatter-add this thread's full z-column of register values (layout
  // r_v[z + comp * P_1d]) into the L-vector d_v through the restriction
  // indices; atomicAdd guards against contributions from shared nodes.
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt xy_node   = data.t_id_x + data.t_id_y * P_1d;
  const CeedInt elem_base = elem * P_1d * P_1d * P_1d;

  for (CeedInt z = 0; z < P_1d; z++) {
    const CeedInt dof = indices[elem_base + xy_node + z * P_1d * P_1d];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      atomicAdd(&d_v[dof + COMP_STRIDE * comp], r_v[z + comp * P_1d]);
    }
  }
}
226 
227 //------------------------------------------------------------------------------
228 // E-vector -> L-vector, strided
229 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Accumulate this thread's full z-column of register values (layout
  // r_v[z + comp * P_1d]) into a strided L-vector layout (plain +=, matching
  // the original non-atomic path).
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt xy_node = data.t_id_x + data.t_id_y * P_1d;

  for (CeedInt z = 0; z < P_1d; z++) {
    const CeedInt base = (xy_node + z * P_1d * P_1d) * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      d_v[base + comp * STRIDES_COMP] += r_v[z + comp * P_1d];
    }
  }
}
241 
242 //------------------------------------------------------------------------------
243 // 3D collocated derivatives computation
244 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1d>
inline __device__ void gradCollo3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                   CeedScalar *__restrict__ r_V) {
  // Compute the three collocated directional derivatives of r_U at x-y slice q.
  //   r_U: per-thread register values, layout r_U[z + comp * Q_1d]
  //   c_G: collocated 1D gradient matrix (accessed as Q_1d x Q_1d)
  //   r_V: output derivatives, layout r_V[comp + dim * NUM_COMP], dim = 0/1/2 for X/Y/Z
  // X and Y derivatives contract across threads through data.slice (shared
  // memory, row stride T_1D); the Z derivative contracts entirely through the
  // calling thread's own registers.
  // NOTE(review): the __syncthreads() barriers sit inside the t_id guard, so
  // threads with t_id_x/y >= Q_1d skip them — presumably safe for the launch
  // configurations the code generator emits, but confirm excluded threads
  // cannot race on data.slice or miss a barrier.
  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      // Publish this thread's value at slice q so neighbors can contract against it
      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1d];
      __syncthreads();
      // X derivative
      r_V[comp + 0 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
      // Y derivative
      r_V[comp + 1 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
      // Z derivative
      r_V[comp + 2 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1d; i++) r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d];  // Contract z direction (Z derivative)
      // Barrier before the next component overwrites the shared slice
      __syncthreads();
    }
  }
}
267 
268 //------------------------------------------------------------------------------
269 // 3D collocated derivatives transpose
270 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1d>
inline __device__ void gradColloTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                            CeedScalar *__restrict__ r_V) {
  // Transpose (adjoint) of gradCollo3d: accumulate the three directional
  // derivative contributions in r_U (layout r_U[comp + dim * NUM_COMP]) into
  // r_V (layout r_V[z + comp * Q_1d]) at/around x-y slice q.
  // NOTE: r_V is accumulated into with += — the caller must initialize it.
  // NOTE(review): as in gradCollo3d, the __syncthreads() barriers are inside
  // the divergent t_id guard — confirm threads with t_id_x/y >= Q_1d cannot
  // deadlock or race on data.slice for the generated launch configurations.
  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      // X derivative: publish X-contribution, then contract with c_G transposed
      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
      __syncthreads();
      // Y derivative: publish Y-contribution, then contract with c_G transposed
      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
      __syncthreads();
      // Z derivative: slice q scatters its contribution to every z level
      // entirely through this thread's registers (no shared memory needed)
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP];  // PARTIAL contract z direction (Z derivative)
    }
  }
}
294 
295 #endif  // CEED_CUDA_GEN_TEMPLATES_H
296