xref: /libCEED/include/ceed/jit-source/cuda/cuda-gen-templates.h (revision ca7355308a14805110a7d98dabf1f658d5ec16d5)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA backend macro and type definitions for JiT source
10 #ifndef CEED_CUDA_GEN_TEMPLATES_H
11 #define CEED_CUDA_GEN_TEMPLATES_H
12 
13 #include <ceed/types.h>
14 
15 //------------------------------------------------------------------------------
16 // Load matrices for basis actions
17 //------------------------------------------------------------------------------
template <int P, int Q>
inline __device__ void loadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  // Copy the P x Q basis matrix entries cooperatively: every thread of the
  // block strides across the flat array until all P*Q values are moved.
  const CeedInt num_threads = blockDim.x * blockDim.y * blockDim.z;

  for (CeedInt k = data.t_id; k < P * Q; k += num_threads) {
    B[k] = d_B[k];
  }
}
22 
23 //------------------------------------------------------------------------------
24 // 1D
25 //------------------------------------------------------------------------------
26 
27 //------------------------------------------------------------------------------
28 // L-vector -> E-vector, offsets provided
29 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void readDofsOffset1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                        const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // One thread per node of the element; extra threads have nothing to read
  // (num_nodes is unused here)
  if (data.t_id_x >= P_1d) return;

  const CeedInt dof = indices[data.t_id_x + elem * P_1d];

  // Gather every component of this node into registers
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[dof + comp * COMP_STRIDE];
  }
}
40 
41 //------------------------------------------------------------------------------
42 // L-vector -> E-vector, strided
43 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // One thread per node of the element; extra threads have nothing to read
  if (data.t_id_x >= P_1d) return;

  const CeedInt base = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  // Gather every component of this node into registers
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[base + comp * STRIDES_COMP];
  }
}
53 
54 //------------------------------------------------------------------------------
55 // E-vector -> L-vector, offsets provided
56 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void writeDofsOffset1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                         const CeedScalar *r_v, CeedScalar *d_v) {
  // One thread per node of the element; extra threads have nothing to write
  // (num_nodes is unused here)
  if (data.t_id_x >= P_1d) return;

  const CeedInt dof = indices[data.t_id_x + elem * P_1d];

  // Accumulate atomically: different elements may index the same L-vector entry
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    atomicAdd(&d_v[dof + comp * COMP_STRIDE], r_v[comp]);
  }
}
67 
68 //------------------------------------------------------------------------------
69 // E-vector -> L-vector, strided
70 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  // One thread per node of the element; extra threads have nothing to write
  if (data.t_id_x >= P_1d) return;

  const CeedInt base = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  // Plain accumulation (no atomics): the strided layout gives each
  // (node, comp, elem) triple its own destination entry
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    d_v[base + comp * STRIDES_COMP] += r_v[comp];
  }
}
80 
81 //------------------------------------------------------------------------------
82 // 2D
83 //------------------------------------------------------------------------------
84 
85 //------------------------------------------------------------------------------
86 // L-vector -> E-vector, offsets provided
87 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void readDofsOffset2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                        const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // One thread per node of the P_1d x P_1d tile; threads outside it do nothing
  // (num_nodes is unused here)
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
  const CeedInt dof  = indices[node + elem * P_1d * P_1d];

  // Gather every component of this node into registers
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[dof + comp * COMP_STRIDE];
  }
}
98 
99 //------------------------------------------------------------------------------
100 // L-vector -> E-vector, strided
101 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // One thread per node of the P_1d x P_1d tile; threads outside it do nothing
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
  const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;

  // Gather every component of this node into registers
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[base + comp * STRIDES_COMP];
  }
}
111 
112 //------------------------------------------------------------------------------
113 // E-vector -> L-vector, offsets provided
114 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void writeDofsOffset2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                         const CeedScalar *r_v, CeedScalar *d_v) {
  // One thread per node of the P_1d x P_1d tile; threads outside it do nothing
  // (num_nodes is unused here)
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
  const CeedInt dof  = indices[node + elem * P_1d * P_1d];

  // Accumulate atomically: different elements may index the same L-vector entry
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    atomicAdd(&d_v[dof + comp * COMP_STRIDE], r_v[comp]);
  }
}
125 
126 //------------------------------------------------------------------------------
127 // E-vector -> L-vector, strided
128 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  // One thread per node of the P_1d x P_1d tile; threads outside it do nothing
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
  const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;

  // Plain accumulation (no atomics): the strided layout gives each
  // (node, comp, elem) triple its own destination entry
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    d_v[base + comp * STRIDES_COMP] += r_v[comp];
  }
}
138 
139 //------------------------------------------------------------------------------
140 // 3D
141 //------------------------------------------------------------------------------
142 
143 //------------------------------------------------------------------------------
144 // L-vector -> E-vector, offsets provided
145 //------------------------------------------------------------------------------
146 // TODO: remove "Dofs" and "Quads" in the following function names?
147 //   - readDofsOffset3d -> readOffset3d ?
148 //   - readDofsStrided3d -> readStrided3d ?
149 //   - readSliceQuadsOffset3d -> readSliceOffset3d ?
150 //   - readSliceQuadsStrided3d -> readSliceStrided3d ?
151 //   - writeDofsOffset3d -> writeOffset3d ?
152 //   - writeDofsStrided3d -> writeStrided3d ?
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void readDofsOffset3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                        const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Threads outside the P_1d x P_1d tile have nothing to load
  // (num_nodes is unused here)
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt xy = data.t_id_x + data.t_id_y * P_1d;

  // March up the z-levels; each thread gathers its column of nodes
  for (CeedInt z = 0; z < P_1d; z++) {
    const CeedInt dof = indices[xy + z * P_1d * P_1d + elem * P_1d * P_1d * P_1d];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      r_u[z + comp * P_1d] = d_u[dof + comp * COMP_STRIDE];
    }
  }
}
164 
165 //------------------------------------------------------------------------------
166 // L-vector -> E-vector, strided
167 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Threads outside the P_1d x P_1d tile have nothing to load
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt xy = data.t_id_x + data.t_id_y * P_1d;

  // March up the z-levels; each thread gathers its column of nodes
  for (CeedInt z = 0; z < P_1d; z++) {
    const CeedInt base = (xy + z * P_1d * P_1d) * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      r_u[z + comp * P_1d] = d_u[base + comp * STRIDES_COMP];
    }
  }
}
178 
179 //------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
181 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int Q_1d>
inline __device__ void readSliceQuadsOffset3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
                                              const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Read one z-slice (level q) of quadrature values; threads outside the
  // Q_1d x Q_1d tile do nothing (nquads is unused here)
  if (data.t_id_x >= Q_1d || data.t_id_y >= Q_1d) return;

  const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
  const CeedInt ind  = indices[node + elem * Q_1d * Q_1d * Q_1d];

  // Gather every component of this quadrature point into registers
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[ind + comp * COMP_STRIDE];
  }
}
192 
193 //------------------------------------------------------------------------------
194 // E-vector -> Q-vector, strided
195 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
                                               CeedScalar *r_u) {
  // Read one z-slice (level q) of quadrature values; threads outside the
  // Q_1d x Q_1d tile do nothing
  if (data.t_id_x >= Q_1d || data.t_id_y >= Q_1d) return;

  const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
  const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;

  // Gather every component of this quadrature point into registers
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[base + comp * STRIDES_COMP];
  }
}
206 
207 //------------------------------------------------------------------------------
208 // E-vector -> L-vector, offsets provided
209 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void writeDofsOffset3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                         const CeedScalar *r_v, CeedScalar *d_v) {
  // Threads outside the P_1d x P_1d tile have nothing to write
  // (num_nodes is unused here)
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt xy = data.t_id_x + data.t_id_y * P_1d;

  // March up the z-levels; accumulate atomically since different elements may
  // index the same L-vector entry
  for (CeedInt z = 0; z < P_1d; z++) {
    const CeedInt dof = indices[xy + z * P_1d * P_1d + elem * P_1d * P_1d * P_1d];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      atomicAdd(&d_v[dof + comp * COMP_STRIDE], r_v[z + comp * P_1d]);
    }
  }
}
221 
222 //------------------------------------------------------------------------------
223 // E-vector -> L-vector, strided
224 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  // Threads outside the P_1d x P_1d tile have nothing to write
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt xy = data.t_id_x + data.t_id_y * P_1d;

  // March up the z-levels; plain accumulation (no atomics) since the strided
  // layout gives each (node, comp, elem) triple its own destination entry
  for (CeedInt z = 0; z < P_1d; z++) {
    const CeedInt base = (xy + z * P_1d * P_1d) * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      d_v[base + comp * STRIDES_COMP] += r_v[z + comp * P_1d];
    }
  }
}
235 
236 //------------------------------------------------------------------------------
237 // 3D collocated derivatives computation
238 //------------------------------------------------------------------------------
// Compute collocated derivatives at z-level q: for each component, stage the
// level's values in shared memory (data.slice, row stride T_1D), then contract
// with c_G along x, y, and z.  Writes d/dx, d/dy, d/dz into r_V[comp + dim * NUM_COMP].
//
// NOTE: __syncthreads() must be reached by every thread of the block, so the
// barriers are kept OUTSIDE the `active` predicate.  The original placed them
// inside `if (t_id_x < Q_1d && t_id_y < Q_1d)`, which is undefined behavior
// whenever the block is launched wider than Q_1d x Q_1d.
template <int NUM_COMP, int Q_1d>
inline __device__ void gradCollo3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                   CeedScalar *__restrict__ r_V) {
  const bool active = data.t_id_x < Q_1d && data.t_id_y < Q_1d;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Stage this component's z-level in shared memory so threads can read
    // neighboring values along x and y
    if (active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1d];
    __syncthreads();
    if (active) {
      // X derivative: contract x direction
      r_V[comp + 0 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1d; i++) r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D];
      // Y derivative: contract y direction
      r_V[comp + 1 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1d; i++) r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D];
      // Z derivative: contract z direction, register-only (no shared memory)
      r_V[comp + 2 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1d; i++) r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d];
    }
    // Keep the slice stable until all reads of this component are done
    __syncthreads();
  }
}
261 
262 //------------------------------------------------------------------------------
263 // 3D collocated derivatives transpose
264 //------------------------------------------------------------------------------
// Transpose of gradCollo3d: accumulate the contributions of the x/y/z
// derivative components (r_U[comp + dim * NUM_COMP]) into the collocated
// values r_V at z-level q, using shared memory (data.slice, row stride T_1D)
// for the x and y contractions.
//
// NOTE: __syncthreads() must be reached by every thread of the block, so the
// barriers are kept OUTSIDE the `active` predicate.  The original placed them
// inside `if (t_id_x < Q_1d && t_id_y < Q_1d)`, which is undefined behavior
// whenever the block is launched wider than Q_1d x Q_1d.
template <int NUM_COMP, int Q_1d>
inline __device__ void gradColloTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                            CeedScalar *__restrict__ r_V) {
  const bool active = data.t_id_x < Q_1d && data.t_id_y < Q_1d;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // X derivative: stage, then contract x direction (transposed)
    if (active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
    __syncthreads();
    if (active) {
      for (CeedInt i = 0; i < Q_1d; i++) r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D];
    }
    __syncthreads();
    // Y derivative: stage, then contract y direction (transposed)
    if (active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
    __syncthreads();
    if (active) {
      for (CeedInt i = 0; i < Q_1d; i++) r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D];
      // Z derivative: PARTIAL contract z direction, register-only
      for (CeedInt i = 0; i < Q_1d; i++) r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP];
    }
    // Keep the slice stable until all reads of this component are done
    __syncthreads();
  }
}
288 
289 #endif  // CEED_CUDA_GEN_TEMPLATES_H
290