xref: /libCEED/include/ceed/jit-source/cuda/cuda-gen-templates.h (revision f4112a4e6eeebf7fbbab43aa04d8ed6ce8965ebf)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA backend macro and type definitions for JiT source
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // Load matrices for basis actions
14 //------------------------------------------------------------------------------
template <int P, int Q>
inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  // Cooperatively copy the P x Q basis matrix d_B into B: each thread starts at
  // its flat id and strides by the total number of threads in the block.
  const CeedInt num_threads = blockDim.x * blockDim.y * blockDim.z;

  for (CeedInt k = data.t_id; k < P * Q; k += num_threads) B[k] = d_B[k];
}
19 
20 //------------------------------------------------------------------------------
21 // 1D
22 //------------------------------------------------------------------------------
23 
24 //------------------------------------------------------------------------------
25 // L-vector -> E-vector, offsets provided
26 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // One thread per node along x; threads beyond P_1d carry no node data
  if (data.t_id_x >= P_1d) return;

  // Look up this element node's offset into the L-vector
  const CeedInt dof = indices[data.t_id_x + elem * P_1d];

  // Gather all components for this node into registers
  for (CeedInt c = 0; c < NUM_COMP; c++) r_u[c] = d_u[dof + c * COMP_STRIDE];
}
37 
38 //------------------------------------------------------------------------------
39 // L-vector -> E-vector, strided
40 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  // One thread per node along x; threads beyond P_1d carry no node data
  if (data.t_id_x >= P_1d) return;

  // Strided addressing: fixed compile-time strides for node, component, element
  const CeedInt base = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt c = 0; c < NUM_COMP; c++) r_u[c] = d_u[base + c * STRIDES_COMP];
}
51 
52 //------------------------------------------------------------------------------
53 // E-vector -> L-vector, offsets provided
54 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // One thread per node along x; threads beyond P_1d carry no node data
  if (data.t_id_x >= P_1d) return;

  // Look up this element node's offset into the L-vector
  const CeedInt dof = indices[data.t_id_x + elem * P_1d];

  // Accumulate atomically: indexed offsets may coincide across threads/elements
  for (CeedInt c = 0; c < NUM_COMP; c++) atomicAdd(&d_v[dof + c * COMP_STRIDE], r_v[c]);
}
65 
66 //------------------------------------------------------------------------------
67 // E-vector -> L-vector, strided
68 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // One thread per node along x; threads beyond P_1d carry no node data
  if (data.t_id_x >= P_1d) return;

  // Strided addressing: fixed compile-time strides for node, component, element
  const CeedInt base = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  // Plain accumulation (no atomics) in the strided layout
  for (CeedInt c = 0; c < NUM_COMP; c++) d_v[base + c * STRIDES_COMP] += r_v[c];
}
79 
80 //------------------------------------------------------------------------------
81 // 2D
82 //------------------------------------------------------------------------------
83 
84 //------------------------------------------------------------------------------
85 // L-vector -> E-vector, offsets provided
86 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // One thread per (x, y) node of the P_1d x P_1d tensor-product element
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  // Flatten (x, y) to a node id, then look up its L-vector offset
  const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
  const CeedInt dof  = indices[node + elem * P_1d * P_1d];

  for (CeedInt c = 0; c < NUM_COMP; c++) r_u[c] = d_u[dof + c * COMP_STRIDE];
}
97 
98 //------------------------------------------------------------------------------
99 // L-vector -> E-vector, strided
100 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  // One thread per (x, y) node of the P_1d x P_1d tensor-product element
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  // Flatten (x, y) to a node id and apply the compile-time strides
  const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
  const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt c = 0; c < NUM_COMP; c++) r_u[c] = d_u[base + c * STRIDES_COMP];
}
111 
112 //------------------------------------------------------------------------------
113 // E-vector -> L-vector, offsets provided
114 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // One thread per (x, y) node of the P_1d x P_1d tensor-product element
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  // Flatten (x, y) to a node id, then look up its L-vector offset
  const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
  const CeedInt dof  = indices[node + elem * P_1d * P_1d];

  // Accumulate atomically: indexed offsets may coincide across threads/elements
  for (CeedInt c = 0; c < NUM_COMP; c++) atomicAdd(&d_v[dof + c * COMP_STRIDE], r_v[c]);
}
125 
126 //------------------------------------------------------------------------------
127 // E-vector -> L-vector, strided
128 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // One thread per (x, y) node of the P_1d x P_1d tensor-product element
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  // Flatten (x, y) to a node id and apply the compile-time strides
  const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
  const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;

  // Plain accumulation (no atomics) in the strided layout
  for (CeedInt c = 0; c < NUM_COMP; c++) d_v[base + c * STRIDES_COMP] += r_v[c];
}
139 
140 //------------------------------------------------------------------------------
141 // 3D
142 //------------------------------------------------------------------------------
143 
144 //------------------------------------------------------------------------------
145 // L-vector -> E-vector, offsets provided
146 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Each (x, y) thread owns a full z-line of nodes; values land in registers
  // as r_u[z + comp * P_1d]
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  for (CeedInt z = 0; z < P_1d; z++) {
    // Flatten (x, y, z) to a node id, then look up its L-vector offset
    const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
    const CeedInt dof  = indices[node + elem * P_1d * P_1d * P_1d];

    for (CeedInt c = 0; c < NUM_COMP; c++) r_u[z + c * P_1d] = d_u[dof + c * COMP_STRIDE];
  }
}
158 
159 //------------------------------------------------------------------------------
160 // L-vector -> E-vector, strided
161 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  // Each (x, y) thread owns a full z-line of nodes; values land in registers
  // as r_u[z + comp * P_1d]
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  for (CeedInt z = 0; z < P_1d; z++) {
    // Flatten (x, y, z) to a node id and apply the compile-time strides
    const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
    const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt c = 0; c < NUM_COMP; c++) r_u[z + c * P_1d] = d_u[base + c * STRIDES_COMP];
  }
}
173 
174 //------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
176 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int Q_1d>
inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
                                               const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
                                               CeedScalar *__restrict__ r_u) {
  // Read one z-slice (level q) of quadrature data; one thread per (x, y) point
  if (data.t_id_x >= Q_1d || data.t_id_y >= Q_1d) return;

  // Flatten (x, y, q) to a point id, then look up its offset in the index list
  const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
  const CeedInt dof  = indices[node + elem * Q_1d * Q_1d * Q_1d];

  for (CeedInt c = 0; c < NUM_COMP; c++) r_u[c] = d_u[dof + c * COMP_STRIDE];
}
188 
189 //------------------------------------------------------------------------------
190 // E-vector -> Q-vector, strided
191 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
                                              CeedScalar *__restrict__ r_u) {
  // Read one z-slice (level q) of quadrature data; one thread per (x, y) point
  if (data.t_id_x >= Q_1d || data.t_id_y >= Q_1d) return;

  // Flatten (x, y, q) to a point id and apply the compile-time strides
  const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
  const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt c = 0; c < NUM_COMP; c++) r_u[c] = d_u[base + c * STRIDES_COMP];
}
202 
203 //------------------------------------------------------------------------------
204 // E-vector -> L-vector, offsets provided
205 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Each (x, y) thread writes a full z-line of nodes from registers laid out
  // as r_v[z + comp * P_1d]
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  for (CeedInt z = 0; z < P_1d; z++) {
    // Flatten (x, y, z) to a node id, then look up its L-vector offset
    const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
    const CeedInt dof  = indices[node + elem * P_1d * P_1d * P_1d];

    // Accumulate atomically: indexed offsets may coincide across threads/elements
    for (CeedInt c = 0; c < NUM_COMP; c++) atomicAdd(&d_v[dof + c * COMP_STRIDE], r_v[z + c * P_1d]);
  }
}
217 
218 //------------------------------------------------------------------------------
219 // E-vector -> L-vector, strided
220 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Each (x, y) thread writes a full z-line of nodes from registers laid out
  // as r_v[z + comp * P_1d]
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  for (CeedInt z = 0; z < P_1d; z++) {
    // Flatten (x, y, z) to a node id and apply the compile-time strides
    const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
    const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;

    // Plain accumulation (no atomics) in the strided layout
    for (CeedInt c = 0; c < NUM_COMP; c++) d_v[base + c * STRIDES_COMP] += r_v[z + c * P_1d];
  }
}
232 
233 //------------------------------------------------------------------------------
234 // 3D collocated derivatives computation
235 //------------------------------------------------------------------------------
// Computes the X, Y, and Z derivatives of r_U at z-slice q using the 1D
// collocated gradient matrix c_G (Q_1d x Q_1d entries).  For this thread's
// (t_id_x, t_id_y) point, r_U holds Q_1d values per component along z
// (r_U[z + comp * Q_1d]); r_V receives one value per (component, direction),
// laid out as r_V[comp + dim * NUM_COMP].  The X and Y contractions exchange
// data across threads through the shared-memory buffer data.slice (row pitch
// T_1D); the Z contraction is purely thread-local over r_U.
template <int NUM_COMP, int Q_1d>
inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                        CeedScalar *__restrict__ r_V) {
  // NOTE(review): the __syncthreads() calls below are inside this guard, so the
  // launch is assumed to make all threads of the block evaluate the guard the
  // same way (or to tolerate divergent barriers) -- TODO confirm launch bounds
  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      // Publish this thread's value at slice q so neighboring threads can read it
      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1d];
      __syncthreads();
      // X derivative
      r_V[comp + 0 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
      // Y derivative
      r_V[comp + 1 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
      // Z derivative: contracts this thread's own register z-line, no sharing needed
      r_V[comp + 2 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1d; i++) r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d];  // Contract z direction (Z derivative)
      // Barrier before the next component overwrites data.slice
      __syncthreads();
    }
  }
}
258 
259 //------------------------------------------------------------------------------
260 // 3D collocated derivatives transpose
261 //------------------------------------------------------------------------------
// Transpose action of GradColloSlice3d: accumulates (+=) the contributions of
// the three derivative components r_U[comp + dim * NUM_COMP] at z-slice q into
// r_V (r_V[z + comp * Q_1d]); r_V must already hold valid values since nothing
// here zeroes it.  X and Y contractions use the transposed c_G indexing and
// exchange data through the shared-memory buffer data.slice (row pitch T_1D);
// the Z contraction scatters into this thread's own register z-line.
template <int NUM_COMP, int Q_1d>
inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                                 CeedScalar *__restrict__ r_V) {
  // NOTE(review): the __syncthreads() calls below are inside this guard, so the
  // launch is assumed to make all threads of the block evaluate the guard the
  // same way (or to tolerate divergent barriers) -- TODO confirm launch bounds
  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      // X derivative: publish this component's X-derivative value for neighbors
      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
      __syncthreads();
      // Y derivative: reuse data.slice for the Y-derivative values
      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
      __syncthreads();
      // Z derivative: thread-local scatter of slice q's value across the z-line
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP];  // PARTIAL contract z direction (Z derivative)
    }
  }
}
284 }
285