xref: /libCEED/include/ceed/jit-source/cuda/cuda-gen-templates.h (revision 0183ed61035d97ff853cf8c8e722c0fda76e54df)
1d275d636SJeremy L Thompson // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
29e201c85SYohann // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39e201c85SYohann //
49e201c85SYohann // SPDX-License-Identifier: BSD-2-Clause
59e201c85SYohann //
69e201c85SYohann // This file is part of CEED:  http://github.com/ceed
79e201c85SYohann 
89e201c85SYohann /// @file
99e201c85SYohann /// Internal header for CUDA backend macro and type definitions for JiT source
10c0b5abf0SJeremy L Thompson #include <ceed/types.h>
119e201c85SYohann 
129e201c85SYohann //------------------------------------------------------------------------------
139e201c85SYohann // Load matrices for basis actions
149e201c85SYohann //------------------------------------------------------------------------------
159e201c85SYohann template <int P, int Q>
16f815fac9SJeremy L Thompson inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
172b730f8bSJeremy L Thompson   for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
189e201c85SYohann }
199e201c85SYohann 
209e201c85SYohann //------------------------------------------------------------------------------
218b97b69aSJeremy L Thompson // AtPoints
228b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
238b97b69aSJeremy L Thompson 
248b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
258b97b69aSJeremy L Thompson // L-vector -> single point
268b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
278b97b69aSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
288b97b69aSJeremy L Thompson inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
298b97b69aSJeremy L Thompson                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
308b97b69aSJeremy L Thompson   const CeedInt ind = indices[p + elem * NUM_PTS];
318b97b69aSJeremy L Thompson 
328b97b69aSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
338b97b69aSJeremy L Thompson     r_u[comp] = d_u[ind + comp * COMP_STRIDE];
348b97b69aSJeremy L Thompson   }
358b97b69aSJeremy L Thompson }
368b97b69aSJeremy L Thompson 
378b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
388b97b69aSJeremy L Thompson // Single point -> L-vector
398b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
408b97b69aSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
418b97b69aSJeremy L Thompson inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
428b97b69aSJeremy L Thompson                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
438b97b69aSJeremy L Thompson   if (p < points_in_elem) {
448b97b69aSJeremy L Thompson     const CeedInt ind = indices[p + elem * NUM_PTS];
458b97b69aSJeremy L Thompson 
468b97b69aSJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
478b97b69aSJeremy L Thompson       d_u[ind + comp * COMP_STRIDE] += r_u[comp];
488b97b69aSJeremy L Thompson     }
498b97b69aSJeremy L Thompson   }
508b97b69aSJeremy L Thompson }
518b97b69aSJeremy L Thompson 
528b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
539e201c85SYohann // 1D
549e201c85SYohann //------------------------------------------------------------------------------
559e201c85SYohann 
569e201c85SYohann //------------------------------------------------------------------------------
57*0183ed61SJeremy L Thompson // Set E-vector value
58*0183ed61SJeremy L Thompson //------------------------------------------------------------------------------
59*0183ed61SJeremy L Thompson template <int NUM_COMP, int P_1D>
60*0183ed61SJeremy L Thompson inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
61*0183ed61SJeremy L Thompson   const CeedInt target_comp = n / P_1D;
62*0183ed61SJeremy L Thompson   const CeedInt target_node = n % P_1D;
63*0183ed61SJeremy L Thompson 
64*0183ed61SJeremy L Thompson   if (data.t_id_x == target_node) {
65*0183ed61SJeremy L Thompson     r_v[target_comp] = value;
66*0183ed61SJeremy L Thompson   }
67*0183ed61SJeremy L Thompson }
68*0183ed61SJeremy L Thompson 
69*0183ed61SJeremy L Thompson //------------------------------------------------------------------------------
709e201c85SYohann // L-vector -> E-vector, offsets provided
719e201c85SYohann //------------------------------------------------------------------------------
7299421279SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
73f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
74672b0f2aSSebastian Grimberg                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
7599421279SJeremy L Thompson   if (data.t_id_x < P_1D) {
769e201c85SYohann     const CeedInt node = data.t_id_x;
7799421279SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D];
78ca735530SJeremy L Thompson 
79ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
809e201c85SYohann   }
819e201c85SYohann }
829e201c85SYohann 
839e201c85SYohann //------------------------------------------------------------------------------
849e201c85SYohann // L-vector -> E-vector, strided
859e201c85SYohann //------------------------------------------------------------------------------
8699421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
87f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
88672b0f2aSSebastian Grimberg                                          CeedScalar *__restrict__ r_u) {
8999421279SJeremy L Thompson   if (data.t_id_x < P_1D) {
909e201c85SYohann     const CeedInt node = data.t_id_x;
919e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
92ca735530SJeremy L Thompson 
93ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
949e201c85SYohann   }
959e201c85SYohann }
969e201c85SYohann 
979e201c85SYohann //------------------------------------------------------------------------------
989e201c85SYohann // E-vector -> L-vector, offsets provided
999e201c85SYohann //------------------------------------------------------------------------------
10099421279SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
101f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
102672b0f2aSSebastian Grimberg                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
10399421279SJeremy L Thompson   if (data.t_id_x < P_1D) {
1049e201c85SYohann     const CeedInt node = data.t_id_x;
10599421279SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D];
106ca735530SJeremy L Thompson 
107ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
1089e201c85SYohann   }
1099e201c85SYohann }
1109e201c85SYohann 
111*0183ed61SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
112*0183ed61SJeremy L Thompson inline __device__ void WriteLVecStandard1d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
113*0183ed61SJeremy L Thompson                                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
114*0183ed61SJeremy L Thompson                                                   CeedScalar *__restrict__ d_v) {
115*0183ed61SJeremy L Thompson   const CeedInt target_comp = n / P_1D;
116*0183ed61SJeremy L Thompson   const CeedInt target_node = n % P_1D;
117*0183ed61SJeremy L Thompson 
118*0183ed61SJeremy L Thompson   if (data.t_id_x == target_node) {
119*0183ed61SJeremy L Thompson     const CeedInt ind = indices[target_node + elem * P_1D];
120*0183ed61SJeremy L Thompson 
121*0183ed61SJeremy L Thompson     atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
122*0183ed61SJeremy L Thompson   }
123*0183ed61SJeremy L Thompson }
124*0183ed61SJeremy L Thompson 
1259e201c85SYohann //------------------------------------------------------------------------------
1269e201c85SYohann // E-vector -> L-vector, strided
1279e201c85SYohann //------------------------------------------------------------------------------
12899421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
129f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
130672b0f2aSSebastian Grimberg                                           CeedScalar *__restrict__ d_v) {
13199421279SJeremy L Thompson   if (data.t_id_x < P_1D) {
1329e201c85SYohann     const CeedInt node = data.t_id_x;
1339e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
134ca735530SJeremy L Thompson 
135ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
1369e201c85SYohann   }
1379e201c85SYohann }
1389e201c85SYohann 
1399e201c85SYohann //------------------------------------------------------------------------------
1409e201c85SYohann // 2D
1419e201c85SYohann //------------------------------------------------------------------------------
1429e201c85SYohann 
1439e201c85SYohann //------------------------------------------------------------------------------
144*0183ed61SJeremy L Thompson // Set E-vector value
145*0183ed61SJeremy L Thompson //------------------------------------------------------------------------------
146*0183ed61SJeremy L Thompson template <int NUM_COMP, int P_1D>
147*0183ed61SJeremy L Thompson inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
148*0183ed61SJeremy L Thompson   const CeedInt target_comp   = n / (P_1D * P_1D);
149*0183ed61SJeremy L Thompson   const CeedInt target_node_x = n % P_1D;
150*0183ed61SJeremy L Thompson   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
151*0183ed61SJeremy L Thompson 
152*0183ed61SJeremy L Thompson   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
153*0183ed61SJeremy L Thompson     r_v[target_comp] = value;
154*0183ed61SJeremy L Thompson   }
155*0183ed61SJeremy L Thompson }
156*0183ed61SJeremy L Thompson 
157*0183ed61SJeremy L Thompson //------------------------------------------------------------------------------
1589e201c85SYohann // L-vector -> E-vector, offsets provided
1599e201c85SYohann //------------------------------------------------------------------------------
16099421279SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
161f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
162672b0f2aSSebastian Grimberg                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
16399421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
16499421279SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
16599421279SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D * P_1D];
166ca735530SJeremy L Thompson 
167ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
1689e201c85SYohann   }
1699e201c85SYohann }
1709e201c85SYohann 
1719e201c85SYohann //------------------------------------------------------------------------------
1729e201c85SYohann // L-vector -> E-vector, strided
1739e201c85SYohann //------------------------------------------------------------------------------
17499421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
175f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
176672b0f2aSSebastian Grimberg                                          CeedScalar *__restrict__ r_u) {
17799421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
17899421279SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
1799e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
180ca735530SJeremy L Thompson 
181ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
1829e201c85SYohann   }
1839e201c85SYohann }
1849e201c85SYohann 
1859e201c85SYohann //------------------------------------------------------------------------------
1869e201c85SYohann // E-vector -> L-vector, offsets provided
1879e201c85SYohann //------------------------------------------------------------------------------
18899421279SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
189f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
190672b0f2aSSebastian Grimberg                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
19199421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
19299421279SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
19399421279SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D * P_1D];
194ca735530SJeremy L Thompson 
195ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
1969e201c85SYohann   }
1979e201c85SYohann }
1989e201c85SYohann 
199*0183ed61SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
200*0183ed61SJeremy L Thompson inline __device__ void WriteLVecStandard2d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
201*0183ed61SJeremy L Thompson                                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
202*0183ed61SJeremy L Thompson                                                   CeedScalar *__restrict__ d_v) {
203*0183ed61SJeremy L Thompson   const CeedInt target_comp   = n / (P_1D * P_1D);
204*0183ed61SJeremy L Thompson   const CeedInt target_node_x = n % P_1D;
205*0183ed61SJeremy L Thompson   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
206*0183ed61SJeremy L Thompson 
207*0183ed61SJeremy L Thompson   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
208*0183ed61SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
209*0183ed61SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D * P_1D];
210*0183ed61SJeremy L Thompson 
211*0183ed61SJeremy L Thompson     atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
212*0183ed61SJeremy L Thompson   }
213*0183ed61SJeremy L Thompson }
214*0183ed61SJeremy L Thompson 
2159e201c85SYohann //------------------------------------------------------------------------------
2169e201c85SYohann // E-vector -> L-vector, strided
2179e201c85SYohann //------------------------------------------------------------------------------
21899421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
219f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
220672b0f2aSSebastian Grimberg                                           CeedScalar *__restrict__ d_v) {
22199421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
22299421279SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
2239e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
224ca735530SJeremy L Thompson 
225ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
2269e201c85SYohann   }
2279e201c85SYohann }
2289e201c85SYohann 
2299e201c85SYohann //------------------------------------------------------------------------------
2309e201c85SYohann // 3D
2319e201c85SYohann //------------------------------------------------------------------------------
2329e201c85SYohann 
2339e201c85SYohann //------------------------------------------------------------------------------
234*0183ed61SJeremy L Thompson // Set E-vector value
235*0183ed61SJeremy L Thompson //------------------------------------------------------------------------------
236*0183ed61SJeremy L Thompson template <int NUM_COMP, int P_1D>
237*0183ed61SJeremy L Thompson inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
238*0183ed61SJeremy L Thompson   const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
239*0183ed61SJeremy L Thompson   const CeedInt target_node_x = n % P_1D;
240*0183ed61SJeremy L Thompson   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
241*0183ed61SJeremy L Thompson   const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);
242*0183ed61SJeremy L Thompson 
243*0183ed61SJeremy L Thompson   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
244*0183ed61SJeremy L Thompson     r_v[target_node_z + target_comp * P_1D] = value;
245*0183ed61SJeremy L Thompson   }
246*0183ed61SJeremy L Thompson }
247*0183ed61SJeremy L Thompson 
248*0183ed61SJeremy L Thompson //------------------------------------------------------------------------------
2499e201c85SYohann // L-vector -> E-vector, offsets provided
2509e201c85SYohann //------------------------------------------------------------------------------
25199421279SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
252f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
253672b0f2aSSebastian Grimberg                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
25499421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
25599421279SJeremy L Thompson     for (CeedInt z = 0; z < P_1D; z++) {
25699421279SJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
25799421279SJeremy L Thompson       const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
258ca735530SJeremy L Thompson 
25999421279SJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + COMP_STRIDE * comp];
2609e201c85SYohann     }
2619e201c85SYohann   }
262688b5473SJeremy L Thompson }
2639e201c85SYohann 
2649e201c85SYohann //------------------------------------------------------------------------------
2659e201c85SYohann // L-vector -> E-vector, strided
2669e201c85SYohann //------------------------------------------------------------------------------
26799421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
268f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
269672b0f2aSSebastian Grimberg                                          CeedScalar *__restrict__ r_u) {
27099421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
27199421279SJeremy L Thompson     for (CeedInt z = 0; z < P_1D; z++) {
27299421279SJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
2739e201c85SYohann       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
274ca735530SJeremy L Thompson 
27599421279SJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + comp * STRIDES_COMP];
2769e201c85SYohann     }
2779e201c85SYohann   }
278688b5473SJeremy L Thompson }
2799e201c85SYohann 
2809e201c85SYohann //------------------------------------------------------------------------------
2819e201c85SYohann // E-vector -> Q-vector, offests provided
2829e201c85SYohann //------------------------------------------------------------------------------
28399421279SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int Q_1D>
284f815fac9SJeremy L Thompson inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
285f815fac9SJeremy L Thompson                                                const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
286f815fac9SJeremy L Thompson                                                CeedScalar *__restrict__ r_u) {
28799421279SJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
28899421279SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
28999421279SJeremy L Thompson     const CeedInt ind  = indices[node + elem * Q_1D * Q_1D * Q_1D];
290ca735530SJeremy L Thompson 
291ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
2929e201c85SYohann   }
2939e201c85SYohann }
2949e201c85SYohann 
2959e201c85SYohann //------------------------------------------------------------------------------
2969e201c85SYohann // E-vector -> Q-vector, strided
2979e201c85SYohann //------------------------------------------------------------------------------
29899421279SJeremy L Thompson template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
299f815fac9SJeremy L Thompson inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
300672b0f2aSSebastian Grimberg                                               CeedScalar *__restrict__ r_u) {
30199421279SJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
30299421279SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
3039e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
304ca735530SJeremy L Thompson 
305ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
3069e201c85SYohann   }
3079e201c85SYohann }
3089e201c85SYohann 
3099e201c85SYohann //------------------------------------------------------------------------------
3109e201c85SYohann // E-vector -> L-vector, offsets provided
3119e201c85SYohann //------------------------------------------------------------------------------
31299421279SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
313f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
314672b0f2aSSebastian Grimberg                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
31599421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
31699421279SJeremy L Thompson     for (CeedInt z = 0; z < P_1D; z++) {
31799421279SJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
31899421279SJeremy L Thompson       const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
319ca735530SJeremy L Thompson 
32099421279SJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]);
3219e201c85SYohann     }
3229e201c85SYohann   }
323688b5473SJeremy L Thompson }
3249e201c85SYohann 
325*0183ed61SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
326*0183ed61SJeremy L Thompson inline __device__ void WriteLVecStandard3d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
327*0183ed61SJeremy L Thompson                                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
328*0183ed61SJeremy L Thompson                                                   CeedScalar *__restrict__ d_v) {
329*0183ed61SJeremy L Thompson   const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
330*0183ed61SJeremy L Thompson   const CeedInt target_node_x = n % P_1D;
331*0183ed61SJeremy L Thompson   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
332*0183ed61SJeremy L Thompson   const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);
333*0183ed61SJeremy L Thompson 
334*0183ed61SJeremy L Thompson   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
335*0183ed61SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D + target_node_z * P_1D * P_1D;
336*0183ed61SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
337*0183ed61SJeremy L Thompson 
338*0183ed61SJeremy L Thompson     atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_node_z + target_comp * P_1D]);
339*0183ed61SJeremy L Thompson   }
340*0183ed61SJeremy L Thompson }
341*0183ed61SJeremy L Thompson 
3429e201c85SYohann //------------------------------------------------------------------------------
3439e201c85SYohann // E-vector -> L-vector, strided
3449e201c85SYohann //------------------------------------------------------------------------------
34599421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
346f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
347672b0f2aSSebastian Grimberg                                           CeedScalar *__restrict__ d_v) {
34899421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
34999421279SJeremy L Thompson     for (CeedInt z = 0; z < P_1D; z++) {
35099421279SJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
3519e201c85SYohann       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
352ca735530SJeremy L Thompson 
35399421279SJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1D];
3549e201c85SYohann     }
3559e201c85SYohann   }
356688b5473SJeremy L Thompson }
3579e201c85SYohann 
3589e201c85SYohann //------------------------------------------------------------------------------
3599e201c85SYohann // 3D collocated derivatives computation
3609e201c85SYohann //------------------------------------------------------------------------------
36199421279SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
362f815fac9SJeremy L Thompson inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
3632b730f8bSJeremy L Thompson                                         CeedScalar *__restrict__ r_V) {
36499421279SJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
365ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
366d6c19ee8SJeremy L Thompson       __syncthreads();
36799421279SJeremy L Thompson       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D];
3689e201c85SYohann       __syncthreads();
3699e201c85SYohann       // X derivative
370ca735530SJeremy L Thompson       r_V[comp + 0 * NUM_COMP] = 0.0;
37199421279SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
37299421279SJeremy L Thompson         r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D];
373688b5473SJeremy L Thompson       }
3749e201c85SYohann       // Y derivative
375ca735530SJeremy L Thompson       r_V[comp + 1 * NUM_COMP] = 0.0;
37699421279SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
37799421279SJeremy L Thompson         r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D];
378688b5473SJeremy L Thompson       }
3799e201c85SYohann       // Z derivative
380ca735530SJeremy L Thompson       r_V[comp + 2 * NUM_COMP] = 0.0;
38199421279SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
38299421279SJeremy L Thompson         r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D];
383688b5473SJeremy L Thompson       }
3849e201c85SYohann     }
3859e201c85SYohann   }
3869e201c85SYohann }
3879e201c85SYohann 
3889e201c85SYohann //------------------------------------------------------------------------------
3899e201c85SYohann // 3D collocated derivatives transpose
3909e201c85SYohann //------------------------------------------------------------------------------
39199421279SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
392f815fac9SJeremy L Thompson inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
3932b730f8bSJeremy L Thompson                                                  CeedScalar *__restrict__ r_V) {
39499421279SJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
395ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
396d6c19ee8SJeremy L Thompson       __syncthreads();
397ca735530SJeremy L Thompson       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
3989e201c85SYohann       __syncthreads();
399688b5473SJeremy L Thompson       // X derivative
40099421279SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
40199421279SJeremy L Thompson         r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D];
402688b5473SJeremy L Thompson       }
4039e201c85SYohann       // Y derivative
404d6c19ee8SJeremy L Thompson       __syncthreads();
405ca735530SJeremy L Thompson       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
4069e201c85SYohann       __syncthreads();
40799421279SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
40899421279SJeremy L Thompson         r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D];
409688b5473SJeremy L Thompson       }
4109e201c85SYohann       // Z derivative
41199421279SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
41299421279SJeremy L Thompson         r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP];
413688b5473SJeremy L Thompson       }
4149e201c85SYohann     }
4159e201c85SYohann   }
4169e201c85SYohann }
417