xref: /libCEED/include/ceed/jit-source/cuda/cuda-gen-templates.h (revision caee03026e6576cbf7a399c2fc51bb918c77f451)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA backend macro and type definitions for JiT source
10 #ifndef _ceed_cuda_gen_templates_h
11 #define _ceed_cuda_gen_templates_h
12 
13 #include <ceed/types.h>
14 
15 //------------------------------------------------------------------------------
16 // Load matrices for basis actions
17 //------------------------------------------------------------------------------
// Cooperatively copy a P x Q basis matrix from global memory (d_B) into the
// destination buffer B, with all threads of the block striding over entries.
template <int P, int Q>
inline __device__ void loadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  const CeedInt num_threads = blockDim.x * blockDim.y * blockDim.z;
  for (CeedInt k = data.t_id; k < P * Q; k += num_threads) B[k] = d_B[k];
}
22 
23 //------------------------------------------------------------------------------
24 // 1D
25 //------------------------------------------------------------------------------
26 
27 //------------------------------------------------------------------------------
28 // L-vector -> E-vector, offsets provided
29 //------------------------------------------------------------------------------
// Read a 1D element's dofs from the L-vector d_u into registers r_u,
// gathering through the restriction indices. One thread per node.
// (nnodes is unused here; kept for interface consistency with callers.)
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                        const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Threads beyond the element's node count have nothing to read
  if (data.t_id_x >= P1d) return;
  const CeedInt n   = data.t_id_x;
  const CeedInt idx = indices[n + elem * P1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) r_u[comp] = d_u[idx + COMPSTRIDE * comp];
}
39 
40 //------------------------------------------------------------------------------
41 // L-vector -> E-vector, strided
42 //------------------------------------------------------------------------------
// Read a 1D element's dofs from a strided L-vector d_u into registers r_u.
// The location of each value is fully determined by the compile-time strides.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Threads beyond the element's node count have nothing to read
  if (data.t_id_x >= P1d) return;
  const CeedInt n    = data.t_id_x;
  const CeedInt base = n * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; comp++) r_u[comp] = d_u[base + comp * STRIDES_COMP];
}
51 
52 //------------------------------------------------------------------------------
53 // E-vector -> L-vector, offsets provided
54 //------------------------------------------------------------------------------
// Accumulate a 1D element's register values r_v into the L-vector d_v,
// scattering through the restriction indices. atomicAdd is required because
// multiple elements may share an L-vector node.
// (nnodes is unused here; kept for interface consistency with callers.)
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                         const CeedScalar *r_v, CeedScalar *d_v) {
  // Threads beyond the element's node count have nothing to write
  if (data.t_id_x >= P1d) return;
  const CeedInt n   = data.t_id_x;
  const CeedInt idx = indices[n + elem * P1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) atomicAdd(&d_v[idx + COMPSTRIDE * comp], r_v[comp]);
}
64 
65 //------------------------------------------------------------------------------
66 // E-vector -> L-vector, strided
67 //------------------------------------------------------------------------------
// Accumulate a 1D element's register values r_v into a strided L-vector d_v.
// No atomics: the strided layout gives each (node, comp, elem) a unique slot.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  // Threads beyond the element's node count have nothing to write
  if (data.t_id_x >= P1d) return;
  const CeedInt n    = data.t_id_x;
  const CeedInt base = n * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; comp++) d_v[base + comp * STRIDES_COMP] += r_v[comp];
}
76 
77 //------------------------------------------------------------------------------
78 // 2D
79 //------------------------------------------------------------------------------
80 
81 //------------------------------------------------------------------------------
82 // L-vector -> E-vector, offsets provided
83 //------------------------------------------------------------------------------
// Read a 2D element's dofs from the L-vector d_u into registers r_u,
// gathering through the restriction indices. One thread per (x, y) node.
// (nnodes is unused here; kept for interface consistency with callers.)
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                        const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Threads outside the P1d x P1d node grid have nothing to read
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  const CeedInt n   = data.t_id_x + data.t_id_y * P1d;
  const CeedInt idx = indices[n + elem * P1d * P1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) r_u[comp] = d_u[idx + COMPSTRIDE * comp];
}
93 
94 //------------------------------------------------------------------------------
95 // L-vector -> E-vector, strided
96 //------------------------------------------------------------------------------
// Read a 2D element's dofs from a strided L-vector d_u into registers r_u.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Threads outside the P1d x P1d node grid have nothing to read
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  const CeedInt n    = data.t_id_x + data.t_id_y * P1d;
  const CeedInt base = n * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; comp++) r_u[comp] = d_u[base + comp * STRIDES_COMP];
}
105 
106 //------------------------------------------------------------------------------
107 // E-vector -> L-vector, offsets provided
108 //------------------------------------------------------------------------------
// Accumulate a 2D element's register values r_v into the L-vector d_v,
// scattering through the restriction indices. atomicAdd is required because
// multiple elements may share an L-vector node.
// (nnodes is unused here; kept for interface consistency with callers.)
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                         const CeedScalar *r_v, CeedScalar *d_v) {
  // Threads outside the P1d x P1d node grid have nothing to write
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  const CeedInt n   = data.t_id_x + data.t_id_y * P1d;
  const CeedInt idx = indices[n + elem * P1d * P1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) atomicAdd(&d_v[idx + COMPSTRIDE * comp], r_v[comp]);
}
118 
119 //------------------------------------------------------------------------------
120 // E-vector -> L-vector, strided
121 //------------------------------------------------------------------------------
// Accumulate a 2D element's register values r_v into a strided L-vector d_v.
// No atomics: the strided layout gives each (node, comp, elem) a unique slot.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  // Threads outside the P1d x P1d node grid have nothing to write
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  const CeedInt n    = data.t_id_x + data.t_id_y * P1d;
  const CeedInt base = n * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; comp++) d_v[base + comp * STRIDES_COMP] += r_v[comp];
}
130 
131 //------------------------------------------------------------------------------
132 // 3D
133 //------------------------------------------------------------------------------
134 
135 //------------------------------------------------------------------------------
136 // L-vector -> E-vector, offsets provided
137 //------------------------------------------------------------------------------
138 // TODO: remove "Dofs" and "Quads" in the following function names?
139 //   - readDofsOffset3d -> readOffset3d ?
140 //   - readDofsStrided3d -> readStrided3d ?
141 //   - readSliceQuadsOffset3d -> readSliceOffset3d ?
142 //   - readSliceQuadsStrided3d -> readSliceStrided3d ?
143 //   - writeDofsOffset3d -> writeOffset3d ?
144 //   - writeDofsStrided3d -> writeStrided3d ?
// Read a 3D element's dofs from the L-vector d_u into registers r_u,
// gathering through the restriction indices. Each (x, y) thread loops over
// the z direction, storing P1d values per component in registers.
// (nnodes is unused here; kept for interface consistency with callers.)
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                        const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Threads outside the P1d x P1d node grid have nothing to read
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  const CeedInt xy = data.t_id_x + data.t_id_y * P1d;
  for (CeedInt z = 0; z < P1d; z++) {
    const CeedInt idx = indices[xy + z * P1d * P1d + elem * P1d * P1d * P1d];
    for (CeedInt comp = 0; comp < NCOMP; comp++) r_u[z + comp * P1d] = d_u[idx + COMPSTRIDE * comp];
  }
}
155 
156 //------------------------------------------------------------------------------
157 // L-vector -> E-vector, strided
158 //------------------------------------------------------------------------------
// Read a 3D element's dofs from a strided L-vector d_u into registers r_u.
// Each (x, y) thread loops over z, storing P1d values per component.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Threads outside the P1d x P1d node grid have nothing to read
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  const CeedInt xy = data.t_id_x + data.t_id_y * P1d;
  for (CeedInt z = 0; z < P1d; z++) {
    const CeedInt base = (xy + z * P1d * P1d) * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; comp++) r_u[z + comp * P1d] = d_u[base + comp * STRIDES_COMP];
  }
}
168 
169 //------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
171 //------------------------------------------------------------------------------
// Read one z-slice (at quadrature index q) of an element's values into
// registers r_u, gathering through the offset indices.
// (nquads is unused here; kept for interface consistency with callers.)
template <int NCOMP, int COMPSTRIDE, int Q1d>
inline __device__ void readSliceQuadsOffset3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
                                              const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Threads outside the Q1d x Q1d grid have nothing to read
  if (data.t_id_x >= Q1d || data.t_id_y >= Q1d) return;
  const CeedInt n   = data.t_id_x + data.t_id_y * Q1d + q * Q1d * Q1d;
  const CeedInt idx = indices[n + elem * Q1d * Q1d * Q1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) r_u[comp] = d_u[idx + COMPSTRIDE * comp];
}
182 
183 //------------------------------------------------------------------------------
184 // E-vector -> Q-vector, strided
185 //------------------------------------------------------------------------------
// Read one z-slice (at quadrature index q) of an element's values from a
// strided E-vector d_u into registers r_u.
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
                                               CeedScalar *r_u) {
  // Threads outside the Q1d x Q1d grid have nothing to read
  if (data.t_id_x >= Q1d || data.t_id_y >= Q1d) return;
  const CeedInt n    = data.t_id_x + data.t_id_y * Q1d + q * Q1d * Q1d;
  const CeedInt base = n * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; comp++) r_u[comp] = d_u[base + comp * STRIDES_COMP];
}
195 
196 //------------------------------------------------------------------------------
197 // E-vector -> L-vector, offsets provided
198 //------------------------------------------------------------------------------
// Accumulate a 3D element's register values r_v into the L-vector d_v,
// scattering through the restriction indices. atomicAdd is required because
// multiple elements may share an L-vector node.
// (nnodes is unused here; kept for interface consistency with callers.)
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                         const CeedScalar *r_v, CeedScalar *d_v) {
  // Threads outside the P1d x P1d node grid have nothing to write
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  const CeedInt xy = data.t_id_x + data.t_id_y * P1d;
  for (CeedInt z = 0; z < P1d; z++) {
    const CeedInt idx = indices[xy + z * P1d * P1d + elem * P1d * P1d * P1d];
    for (CeedInt comp = 0; comp < NCOMP; comp++) atomicAdd(&d_v[idx + COMPSTRIDE * comp], r_v[z + comp * P1d]);
  }
}
209 
210 //------------------------------------------------------------------------------
211 // E-vector -> L-vector, strided
212 //------------------------------------------------------------------------------
// Accumulate a 3D element's register values r_v into a strided L-vector d_v.
// No atomics: the strided layout gives each (node, comp, elem) a unique slot.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  // Threads outside the P1d x P1d node grid have nothing to write
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  const CeedInt xy = data.t_id_x + data.t_id_y * P1d;
  for (CeedInt z = 0; z < P1d; z++) {
    const CeedInt base = (xy + z * P1d * P1d) * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; comp++) d_v[base + comp * STRIDES_COMP] += r_v[z + comp * P1d];
  }
}
222 
223 //------------------------------------------------------------------------------
224 // 3D collocated derivatives computation
225 //------------------------------------------------------------------------------
// Compute collocated derivatives at quadrature slice q.
// For each of the NCOMP components, r_U holds Q1d z-values in registers; the
// X/Y derivatives are computed by contracting a shared-memory slice with the
// collocated gradient matrix c_G, and the Z derivative is contracted entirely
// in registers. Results are written to r_V as [comp + dim * NCOMP].
//
// FIX: the original guarded the whole body with
//   if (t_id_x < Q1d && t_id_y < Q1d) { ... __syncthreads(); ... }
// which places __syncthreads() inside divergent control flow. When the block
// is larger than Q1d x Q1d (e.g. T_1D > Q1d because P1d > Q1d), inactive
// threads never reach the barrier — undefined behavior per the CUDA
// programming guide. The guard is now applied per-statement so every thread
// in the block reaches both barriers.
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                   CeedScalar *__restrict__ r_V) {
  const bool is_active = data.t_id_x < Q1d && data.t_id_y < Q1d;

  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // Stage this component's slice in shared memory for the x/y contractions
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q1d];
    __syncthreads();
    if (is_active) {
      // X derivative: contract x direction
      r_V[comp + 0 * NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i) r_V[comp + 0 * NCOMP] += c_G[i + data.t_id_x * Q1d] * data.slice[i + data.t_id_y * T_1D];
      // Y derivative: contract y direction
      r_V[comp + 1 * NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i) r_V[comp + 1 * NCOMP] += c_G[i + data.t_id_y * Q1d] * data.slice[data.t_id_x + i * T_1D];
      // Z derivative: contract z direction, entirely in registers
      r_V[comp + 2 * NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i) r_V[comp + 2 * NCOMP] += c_G[i + q * Q1d] * r_U[i + comp * Q1d];
    }
    // All threads must pass this barrier before the slice is overwritten
    __syncthreads();
  }
}
248 
249 //------------------------------------------------------------------------------
250 // 3D collocated derivatives transpose
251 //------------------------------------------------------------------------------
// Apply the transpose of the collocated derivative action at quadrature
// slice q, accumulating into r_V. r_U holds the X/Y/Z derivative inputs as
// [comp + dim * NCOMP]; the X/Y contractions go through a shared-memory
// slice, and the Z contraction accumulates partial sums directly in
// registers across slices.
//
// FIX: the original guarded the whole body with
//   if (t_id_x < Q1d && t_id_y < Q1d) { ... __syncthreads(); ... }
// which places __syncthreads() inside divergent control flow. When the block
// is larger than Q1d x Q1d (e.g. T_1D > Q1d because P1d > Q1d), inactive
// threads never reach the barriers — undefined behavior per the CUDA
// programming guide. The guard is now applied per-statement so every thread
// in the block reaches every barrier.
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                            CeedScalar *__restrict__ r_V) {
  const bool is_active = data.t_id_x < Q1d && data.t_id_y < Q1d;

  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative: stage input slice, then contract x direction (transposed)
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NCOMP];
    __syncthreads();
    if (is_active) {
      for (CeedInt i = 0; i < Q1d; ++i) r_V[q + comp * Q1d] += c_G[data.t_id_x + i * Q1d] * data.slice[i + data.t_id_y * T_1D];
    }
    __syncthreads();
    // Y derivative: stage input slice, then contract y direction (transposed)
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NCOMP];
    __syncthreads();
    if (is_active) {
      for (CeedInt i = 0; i < Q1d; ++i) r_V[q + comp * Q1d] += c_G[data.t_id_y + i * Q1d] * data.slice[data.t_id_x + i * T_1D];
    }
    __syncthreads();
    // Z derivative: PARTIAL contraction in z direction, registers only
    if (is_active) {
      for (CeedInt i = 0; i < Q1d; ++i) r_V[i + comp * Q1d] += c_G[i + q * Q1d] * r_U[comp + 2 * NCOMP];
    }
  }
}
275 
276 #endif
277