xref: /libCEED/include/ceed/jit-source/cuda/cuda-gen-templates.h (revision f1c40530bc6f02bc6ca847af718b8b41cc3fd3ca)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA backend macro and type definitions for JiT source
10 #ifndef _ceed_cuda_gen_templates_h
11 #define _ceed_cuda_gen_templates_h
12 
13 #include <ceed/types.h>
14 
15 //------------------------------------------------------------------------------
16 // Load matrices for basis actions
17 //------------------------------------------------------------------------------
template <int P, int Q>
inline __device__ void loadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  // Copy the P x Q basis matrix d_B into B, distributing the P*Q entries
  // across every thread of the block, strided by the total block size.
  const CeedInt num_threads = blockDim.x * blockDim.y * blockDim.z;
  for (CeedInt k = data.t_id; k < P * Q; k += num_threads) B[k] = d_B[k];
}
23 
24 //------------------------------------------------------------------------------
25 // 1D
26 //------------------------------------------------------------------------------
27 
28 //------------------------------------------------------------------------------
29 // L-vector -> E-vector, offsets provided
30 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather this thread's node values for all NCOMP components from the
  // L-vector d_u into registers r_u, via the element restriction indices.
  if (data.t_id_x >= P1d) return;
  const CeedInt ind = indices[data.t_id_x + elem * P1d];
  for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + COMPSTRIDE * comp];
}
40 
41 //------------------------------------------------------------------------------
42 // L-vector -> E-vector, strided
43 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather this thread's node values for all NCOMP components from a
  // strided L-vector d_u into registers r_u.
  if (data.t_id_x >= P1d) return;
  const CeedInt ind = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
}
53 
54 //------------------------------------------------------------------------------
55 // E-vector -> L-vector, offsets provided
56 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
  // Scatter-add this thread's register values r_v into the L-vector d_v via
  // the restriction indices; atomicAdd resolves nodes shared across elements.
  if (data.t_id_x >= P1d) return;
  const CeedInt ind = indices[data.t_id_x + elem * P1d];
  for (CeedInt comp = 0; comp < NCOMP; ++comp) atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
}
66 
67 //------------------------------------------------------------------------------
68 // E-vector -> L-vector, strided
69 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  // Accumulate this thread's register values r_v into a strided L-vector d_v
  // (strided layout implies no node sharing, so plain += suffices).
  if (data.t_id_x >= P1d) return;
  const CeedInt ind = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
}
79 
80 //------------------------------------------------------------------------------
81 // 2D
82 //------------------------------------------------------------------------------
83 
84 //------------------------------------------------------------------------------
85 // L-vector -> E-vector, offsets provided
86 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather one 2D tensor-product node's values for all NCOMP components from
  // the L-vector d_u into registers r_u, via the restriction indices.
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  const CeedInt node = data.t_id_x + data.t_id_y * P1d;
  const CeedInt ind  = indices[node + elem * P1d * P1d];
  for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + COMPSTRIDE * comp];
}
96 
97 //------------------------------------------------------------------------------
98 // L-vector -> E-vector, strided
99 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather one 2D tensor-product node's values for all NCOMP components from
  // a strided L-vector d_u into registers r_u.
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  const CeedInt node = data.t_id_x + data.t_id_y * P1d;
  const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
}
109 
110 //------------------------------------------------------------------------------
111 // E-vector -> L-vector, offsets provided
112 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
  // Scatter-add one 2D tensor-product node's register values r_v into the
  // L-vector d_v; atomicAdd resolves nodes shared across elements.
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  const CeedInt node = data.t_id_x + data.t_id_y * P1d;
  const CeedInt ind  = indices[node + elem * P1d * P1d];
  for (CeedInt comp = 0; comp < NCOMP; ++comp) atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
}
122 
123 //------------------------------------------------------------------------------
124 // E-vector -> L-vector, strided
125 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  // Accumulate one 2D tensor-product node's register values r_v into a
  // strided L-vector d_v (no node sharing in strided layout, plain +=).
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  const CeedInt node = data.t_id_x + data.t_id_y * P1d;
  const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
}
135 
136 //------------------------------------------------------------------------------
137 // 3D
138 //------------------------------------------------------------------------------
139 
140 //------------------------------------------------------------------------------
141 // L-vector -> E-vector, offsets provided
142 //------------------------------------------------------------------------------
143 // TODO: remove "Dofs" and "Quads" in the following function names?
144 //   - readDofsOffset3d -> readOffset3d ?
145 //   - readDofsStrided3d -> readStrided3d ?
146 //   - readSliceQuadsOffset3d -> readSliceOffset3d ?
147 //   - readSliceQuadsStrided3d -> readSliceStrided3d ?
148 //   - writeDofsOffset3d -> writeOffset3d ?
149 //   - writeDofsStrided3d -> writeStrided3d ?
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather a full z-line of nodes (P1d values per component) from the
  // L-vector d_u into registers r_u, via the restriction indices.
  // Register layout: r_u[z + comp*P1d].
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  for (CeedInt z = 0; z < P1d; ++z) {
    const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d;
    const CeedInt ind  = indices[node + elem * P1d * P1d * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[z + comp * P1d] = d_u[ind + COMPSTRIDE * comp];
  }
}
160 
161 //------------------------------------------------------------------------------
162 // L-vector -> E-vector, strided
163 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather a full z-line of nodes (P1d values per component) from a strided
  // L-vector d_u into registers r_u. Register layout: r_u[z + comp*P1d].
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  for (CeedInt z = 0; z < P1d; ++z) {
    const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d;
    const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[z + comp * P1d] = d_u[ind + comp * STRIDES_COMP];
  }
}
174 
175 //------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
177 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int Q1d>
inline __device__ void readSliceQuadsOffset3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather one quadrature point on z-slice q (all NCOMP components) from the
  // E-vector d_u into registers r_u, via the provided indices.
  if (data.t_id_x >= Q1d || data.t_id_y >= Q1d) return;
  const CeedInt node = data.t_id_x + data.t_id_y * Q1d + q * Q1d * Q1d;
  const CeedInt ind  = indices[node + elem * Q1d * Q1d * Q1d];
  for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + COMPSTRIDE * comp];
}
187 
188 //------------------------------------------------------------------------------
189 // E-vector -> Q-vector, strided
190 //------------------------------------------------------------------------------
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather one quadrature point on z-slice q (all NCOMP components) from a
  // strided E-vector d_u into registers r_u.
  if (data.t_id_x >= Q1d || data.t_id_y >= Q1d) return;
  const CeedInt node = data.t_id_x + data.t_id_y * Q1d + q * Q1d * Q1d;
  const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
}
200 
201 //------------------------------------------------------------------------------
202 // E-vector -> L-vector, offsets provided
203 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
  // Scatter-add a full z-line of register values r_v (layout r_v[z + comp*P1d])
  // into the L-vector d_v; atomicAdd resolves nodes shared across elements.
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  for (CeedInt z = 0; z < P1d; ++z) {
    const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d;
    const CeedInt ind  = indices[node + elem * P1d * P1d * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z + comp * P1d]);
  }
}
214 
215 //------------------------------------------------------------------------------
216 // E-vector -> L-vector, strided
217 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  // Accumulate a full z-line of register values r_v (layout r_v[z + comp*P1d])
  // into a strided L-vector d_v (no node sharing in strided layout, plain +=).
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  for (CeedInt z = 0; z < P1d; ++z) {
    const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d;
    const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P1d];
  }
}
228 
229 //------------------------------------------------------------------------------
230 // 3D collocated derivatives computation
231 //------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Compute collocated derivatives at quadrature z-slice q.
  // r_U holds a z-line per component (layout r_U[z + comp*Q1d]); r_V receives
  // one value per component per direction (layout r_V[comp + dim*NCOMP]).
  // X and Y contractions exchange data through data.slice (shared memory);
  // the Z contraction uses registers only.
  //
  // FIX: the original placed __syncthreads() inside the divergent bounds
  // guard. When the block is wider than Q1d (T_1D > Q1d), threads outside the
  // guard skip the barrier, which is undefined behavior. Barriers are now
  // hoisted so every thread of the block reaches them.
  const bool active = data.t_id_x < Q1d && data.t_id_y < Q1d;

  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // Publish this component's slice-q value to shared memory
    if (active) data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[q + comp*Q1d];
    __syncthreads();
    if (active) {
      // X derivative: contract x direction
      r_V[comp+0*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+0*NCOMP] += c_G[i + data.t_id_x*Q1d] * data.slice[i + data.t_id_y*T_1D];
      // Y derivative: contract y direction
      r_V[comp+1*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+1*NCOMP] += c_G[i + data.t_id_y*Q1d] * data.slice[data.t_id_x + i*T_1D];
      // Z derivative: contract z direction (register data only)
      r_V[comp+2*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d];
    }
    // Barrier before the next component overwrites the shared slice
    __syncthreads();
  }
}
254 
255 //------------------------------------------------------------------------------
256 // 3D collocated derivatives transpose
257 //------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose of the collocated-derivative action at quadrature z-slice q.
  // r_U holds one value per component per direction (r_U[comp + dim*NCOMP]);
  // results are ACCUMULATED into r_V (layout r_V[z + comp*Q1d]) — callers must
  // initialize r_V. The Z contraction is only partial: each call adds slice q's
  // contribution to the whole z-line.
  //
  // FIX: the original placed __syncthreads() inside the divergent bounds
  // guard. When the block is wider than Q1d (T_1D > Q1d), threads outside the
  // guard skip the barrier, which is undefined behavior. Barriers are now
  // hoisted so every thread of the block reaches them.
  const bool active = data.t_id_x < Q1d && data.t_id_y < Q1d;

  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative: contract x direction (transpose of c_G)
    if (active) data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[comp + 0*NCOMP];
    __syncthreads();
    if (active)
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.t_id_x + i*Q1d] * data.slice[i + data.t_id_y*T_1D];
    __syncthreads();
    // Y derivative: contract y direction (transpose of c_G)
    if (active) data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[comp + 1*NCOMP];
    __syncthreads();
    if (active)
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.t_id_y + i*Q1d] * data.slice[data.t_id_x + i*T_1D];
    __syncthreads();
    // Z derivative: partial z contraction, registers only (no barrier needed)
    if (active)
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP];
  }
}
280 
281 #endif
282