xref: /libCEED/include/ceed/jit-source/cuda/cuda-gen-templates.h (revision 915834c9f1e582e3fdfc87db6b4fa4e010d293bb)
1d275d636SJeremy L Thompson // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
29e201c85SYohann // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39e201c85SYohann //
49e201c85SYohann // SPDX-License-Identifier: BSD-2-Clause
59e201c85SYohann //
69e201c85SYohann // This file is part of CEED:  http://github.com/ceed
79e201c85SYohann 
89e201c85SYohann /// @file
99e201c85SYohann /// Internal header for CUDA backend macro and type definitions for JiT source
10c0b5abf0SJeremy L Thompson #include <ceed/types.h>
119e201c85SYohann 
129e201c85SYohann //------------------------------------------------------------------------------
139e201c85SYohann // Load matrices for basis actions
149e201c85SYohann //------------------------------------------------------------------------------
159e201c85SYohann template <int P, int Q>
16f815fac9SJeremy L Thompson inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
172b730f8bSJeremy L Thompson   for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
189e201c85SYohann }
199e201c85SYohann 
209e201c85SYohann //------------------------------------------------------------------------------
218b97b69aSJeremy L Thompson // AtPoints
228b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
238b97b69aSJeremy L Thompson 
248b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
258b97b69aSJeremy L Thompson // L-vector -> single point
268b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
278b97b69aSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
288b97b69aSJeremy L Thompson inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
298b97b69aSJeremy L Thompson                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
308b97b69aSJeremy L Thompson   const CeedInt ind = indices[p + elem * NUM_PTS];
318b97b69aSJeremy L Thompson 
328b97b69aSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
338b97b69aSJeremy L Thompson     r_u[comp] = d_u[ind + comp * COMP_STRIDE];
348b97b69aSJeremy L Thompson   }
358b97b69aSJeremy L Thompson }
368b97b69aSJeremy L Thompson 
378b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
388b97b69aSJeremy L Thompson // Single point -> L-vector
398b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
408b97b69aSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
418b97b69aSJeremy L Thompson inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
428b97b69aSJeremy L Thompson                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
438b97b69aSJeremy L Thompson   if (p < points_in_elem) {
448b97b69aSJeremy L Thompson     const CeedInt ind = indices[p + elem * NUM_PTS];
458b97b69aSJeremy L Thompson 
468b97b69aSJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
478b97b69aSJeremy L Thompson       d_u[ind + comp * COMP_STRIDE] += r_u[comp];
488b97b69aSJeremy L Thompson     }
498b97b69aSJeremy L Thompson   }
508b97b69aSJeremy L Thompson }
518b97b69aSJeremy L Thompson 
528b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
539e201c85SYohann // 1D
549e201c85SYohann //------------------------------------------------------------------------------
559e201c85SYohann 
569e201c85SYohann //------------------------------------------------------------------------------
570183ed61SJeremy L Thompson // Set E-vector value
580183ed61SJeremy L Thompson //------------------------------------------------------------------------------
590183ed61SJeremy L Thompson template <int NUM_COMP, int P_1D>
600183ed61SJeremy L Thompson inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
610183ed61SJeremy L Thompson   const CeedInt target_comp = n / P_1D;
620183ed61SJeremy L Thompson   const CeedInt target_node = n % P_1D;
630183ed61SJeremy L Thompson 
640183ed61SJeremy L Thompson   if (data.t_id_x == target_node) {
650183ed61SJeremy L Thompson     r_v[target_comp] = value;
660183ed61SJeremy L Thompson   }
670183ed61SJeremy L Thompson }
680183ed61SJeremy L Thompson 
690183ed61SJeremy L Thompson //------------------------------------------------------------------------------
709e201c85SYohann // L-vector -> E-vector, offsets provided
719e201c85SYohann //------------------------------------------------------------------------------
7299421279SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
73f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
74672b0f2aSSebastian Grimberg                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
7599421279SJeremy L Thompson   if (data.t_id_x < P_1D) {
769e201c85SYohann     const CeedInt node = data.t_id_x;
7799421279SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D];
78ca735530SJeremy L Thompson 
79ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
809e201c85SYohann   }
819e201c85SYohann }
829e201c85SYohann 
839e201c85SYohann //------------------------------------------------------------------------------
849e201c85SYohann // L-vector -> E-vector, strided
859e201c85SYohann //------------------------------------------------------------------------------
8699421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
87f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
88672b0f2aSSebastian Grimberg                                          CeedScalar *__restrict__ r_u) {
8999421279SJeremy L Thompson   if (data.t_id_x < P_1D) {
909e201c85SYohann     const CeedInt node = data.t_id_x;
919e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
92ca735530SJeremy L Thompson 
93ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
949e201c85SYohann   }
959e201c85SYohann }
969e201c85SYohann 
979e201c85SYohann //------------------------------------------------------------------------------
989e201c85SYohann // E-vector -> L-vector, offsets provided
999e201c85SYohann //------------------------------------------------------------------------------
10099421279SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
101f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
102672b0f2aSSebastian Grimberg                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
10399421279SJeremy L Thompson   if (data.t_id_x < P_1D) {
1049e201c85SYohann     const CeedInt node = data.t_id_x;
10599421279SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D];
106ca735530SJeremy L Thompson 
107ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
1089e201c85SYohann   }
1099e201c85SYohann }
1109e201c85SYohann 
1110183ed61SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
1120183ed61SJeremy L Thompson inline __device__ void WriteLVecStandard1d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
1130183ed61SJeremy L Thompson                                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
1140183ed61SJeremy L Thompson                                                   CeedScalar *__restrict__ d_v) {
1150183ed61SJeremy L Thompson   const CeedInt target_comp = n / P_1D;
1160183ed61SJeremy L Thompson   const CeedInt target_node = n % P_1D;
1170183ed61SJeremy L Thompson 
1180183ed61SJeremy L Thompson   if (data.t_id_x == target_node) {
1190183ed61SJeremy L Thompson     const CeedInt ind = indices[target_node + elem * P_1D];
1200183ed61SJeremy L Thompson 
1210183ed61SJeremy L Thompson     atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
1220183ed61SJeremy L Thompson   }
1230183ed61SJeremy L Thompson }
1240183ed61SJeremy L Thompson 
1259e201c85SYohann //------------------------------------------------------------------------------
126*915834c9SZach Atkins // E-vector -> L-vector, full assembly
127*915834c9SZach Atkins //------------------------------------------------------------------------------
128*915834c9SZach Atkins template <int NUM_COMP, int COMP_STRIDE, int P_1D>
129*915834c9SZach Atkins inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
130*915834c9SZach Atkins                                                     const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
131*915834c9SZach Atkins   const CeedInt in_comp    = in / P_1D;
132*915834c9SZach Atkins   const CeedInt in_node    = in % P_1D;
133*915834c9SZach Atkins   const CeedInt e_vec_size = P_1D * NUM_COMP;
134*915834c9SZach Atkins 
135*915834c9SZach Atkins   if (data.t_id_x < P_1D) {
136*915834c9SZach Atkins     const CeedInt out_node = data.t_id_x;
137*915834c9SZach Atkins 
138*915834c9SZach Atkins     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
139*915834c9SZach Atkins       d_v[elem * e_vec_size * e_vec_size + (in_comp * NUM_COMP + comp) * P_1D * P_1D + out_node * P_1D + in_node] += r_v[comp];
140*915834c9SZach Atkins     }
141*915834c9SZach Atkins   }
142*915834c9SZach Atkins }
143*915834c9SZach Atkins 
144*915834c9SZach Atkins //------------------------------------------------------------------------------
1459e201c85SYohann // E-vector -> L-vector, strided
1469e201c85SYohann //------------------------------------------------------------------------------
14799421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
148f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
149672b0f2aSSebastian Grimberg                                           CeedScalar *__restrict__ d_v) {
15099421279SJeremy L Thompson   if (data.t_id_x < P_1D) {
1519e201c85SYohann     const CeedInt node = data.t_id_x;
1529e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
153ca735530SJeremy L Thompson 
154ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
1559e201c85SYohann   }
1569e201c85SYohann }
1579e201c85SYohann 
1589e201c85SYohann //------------------------------------------------------------------------------
1599e201c85SYohann // 2D
1609e201c85SYohann //------------------------------------------------------------------------------
1619e201c85SYohann 
1629e201c85SYohann //------------------------------------------------------------------------------
1630183ed61SJeremy L Thompson // Set E-vector value
1640183ed61SJeremy L Thompson //------------------------------------------------------------------------------
1650183ed61SJeremy L Thompson template <int NUM_COMP, int P_1D>
1660183ed61SJeremy L Thompson inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
1670183ed61SJeremy L Thompson   const CeedInt target_comp   = n / (P_1D * P_1D);
1680183ed61SJeremy L Thompson   const CeedInt target_node_x = n % P_1D;
1690183ed61SJeremy L Thompson   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
1700183ed61SJeremy L Thompson 
1710183ed61SJeremy L Thompson   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
1720183ed61SJeremy L Thompson     r_v[target_comp] = value;
1730183ed61SJeremy L Thompson   }
1740183ed61SJeremy L Thompson }
1750183ed61SJeremy L Thompson 
1760183ed61SJeremy L Thompson //------------------------------------------------------------------------------
1779e201c85SYohann // L-vector -> E-vector, offsets provided
1789e201c85SYohann //------------------------------------------------------------------------------
17999421279SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
180f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
181672b0f2aSSebastian Grimberg                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
18299421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
18399421279SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
18499421279SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D * P_1D];
185ca735530SJeremy L Thompson 
186ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
1879e201c85SYohann   }
1889e201c85SYohann }
1899e201c85SYohann 
1909e201c85SYohann //------------------------------------------------------------------------------
1919e201c85SYohann // L-vector -> E-vector, strided
1929e201c85SYohann //------------------------------------------------------------------------------
19399421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
194f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
195672b0f2aSSebastian Grimberg                                          CeedScalar *__restrict__ r_u) {
19699421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
19799421279SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
1989e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
199ca735530SJeremy L Thompson 
200ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
2019e201c85SYohann   }
2029e201c85SYohann }
2039e201c85SYohann 
2049e201c85SYohann //------------------------------------------------------------------------------
2059e201c85SYohann // E-vector -> L-vector, offsets provided
2069e201c85SYohann //------------------------------------------------------------------------------
20799421279SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
208f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
209672b0f2aSSebastian Grimberg                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
21099421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
21199421279SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
21299421279SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D * P_1D];
213ca735530SJeremy L Thompson 
214ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
2159e201c85SYohann   }
2169e201c85SYohann }
2179e201c85SYohann 
2180183ed61SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
2190183ed61SJeremy L Thompson inline __device__ void WriteLVecStandard2d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
2200183ed61SJeremy L Thompson                                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
2210183ed61SJeremy L Thompson                                                   CeedScalar *__restrict__ d_v) {
2220183ed61SJeremy L Thompson   const CeedInt target_comp   = n / (P_1D * P_1D);
2230183ed61SJeremy L Thompson   const CeedInt target_node_x = n % P_1D;
2240183ed61SJeremy L Thompson   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
2250183ed61SJeremy L Thompson 
2260183ed61SJeremy L Thompson   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
2270183ed61SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
2280183ed61SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D * P_1D];
2290183ed61SJeremy L Thompson 
2300183ed61SJeremy L Thompson     atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
2310183ed61SJeremy L Thompson   }
2320183ed61SJeremy L Thompson }
2330183ed61SJeremy L Thompson 
2349e201c85SYohann //------------------------------------------------------------------------------
235*915834c9SZach Atkins // E-vector -> L-vector, full assembly
236*915834c9SZach Atkins //------------------------------------------------------------------------------
237*915834c9SZach Atkins template <int NUM_COMP, int COMP_STRIDE, int P_1D>
238*915834c9SZach Atkins inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
239*915834c9SZach Atkins                                                     const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
240*915834c9SZach Atkins   const CeedInt elem_size  = P_1D * P_1D;
241*915834c9SZach Atkins   const CeedInt in_comp    = in / elem_size;
242*915834c9SZach Atkins   const CeedInt in_node_x  = in % P_1D;
243*915834c9SZach Atkins   const CeedInt in_node_y  = (in % elem_size) / P_1D;
244*915834c9SZach Atkins   const CeedInt e_vec_size = elem_size * NUM_COMP;
245*915834c9SZach Atkins 
246*915834c9SZach Atkins   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
247*915834c9SZach Atkins     const CeedInt in_node  = in_node_x + in_node_y * P_1D;
248*915834c9SZach Atkins     const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D;
249*915834c9SZach Atkins 
250*915834c9SZach Atkins     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
251*915834c9SZach Atkins       const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;
252*915834c9SZach Atkins 
253*915834c9SZach Atkins       d_v[elem * e_vec_size * e_vec_size + index] += r_v[comp];
254*915834c9SZach Atkins     }
255*915834c9SZach Atkins   }
256*915834c9SZach Atkins }
257*915834c9SZach Atkins 
258*915834c9SZach Atkins //------------------------------------------------------------------------------
2599e201c85SYohann // E-vector -> L-vector, strided
2609e201c85SYohann //------------------------------------------------------------------------------
26199421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
262f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
263672b0f2aSSebastian Grimberg                                           CeedScalar *__restrict__ d_v) {
26499421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
26599421279SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
2669e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
267ca735530SJeremy L Thompson 
268ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
2699e201c85SYohann   }
2709e201c85SYohann }
2719e201c85SYohann 
2729e201c85SYohann //------------------------------------------------------------------------------
2739e201c85SYohann // 3D
2749e201c85SYohann //------------------------------------------------------------------------------
2759e201c85SYohann 
2769e201c85SYohann //------------------------------------------------------------------------------
2770183ed61SJeremy L Thompson // Set E-vector value
2780183ed61SJeremy L Thompson //------------------------------------------------------------------------------
2790183ed61SJeremy L Thompson template <int NUM_COMP, int P_1D>
2800183ed61SJeremy L Thompson inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
2810183ed61SJeremy L Thompson   const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
2820183ed61SJeremy L Thompson   const CeedInt target_node_x = n % P_1D;
2830183ed61SJeremy L Thompson   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
2840183ed61SJeremy L Thompson   const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);
2850183ed61SJeremy L Thompson 
2860183ed61SJeremy L Thompson   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
2870183ed61SJeremy L Thompson     r_v[target_node_z + target_comp * P_1D] = value;
2880183ed61SJeremy L Thompson   }
2890183ed61SJeremy L Thompson }
2900183ed61SJeremy L Thompson 
2910183ed61SJeremy L Thompson //------------------------------------------------------------------------------
2929e201c85SYohann // L-vector -> E-vector, offsets provided
2939e201c85SYohann //------------------------------------------------------------------------------
29499421279SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
295f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
296672b0f2aSSebastian Grimberg                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
29799421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
29899421279SJeremy L Thompson     for (CeedInt z = 0; z < P_1D; z++) {
29999421279SJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
30099421279SJeremy L Thompson       const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
301ca735530SJeremy L Thompson 
30299421279SJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + COMP_STRIDE * comp];
3039e201c85SYohann     }
3049e201c85SYohann   }
305688b5473SJeremy L Thompson }
3069e201c85SYohann 
3079e201c85SYohann //------------------------------------------------------------------------------
3089e201c85SYohann // L-vector -> E-vector, strided
3099e201c85SYohann //------------------------------------------------------------------------------
31099421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
311f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
312672b0f2aSSebastian Grimberg                                          CeedScalar *__restrict__ r_u) {
31399421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
31499421279SJeremy L Thompson     for (CeedInt z = 0; z < P_1D; z++) {
31599421279SJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
3169e201c85SYohann       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
317ca735530SJeremy L Thompson 
31899421279SJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + comp * STRIDES_COMP];
3199e201c85SYohann     }
3209e201c85SYohann   }
321688b5473SJeremy L Thompson }
3229e201c85SYohann 
3239e201c85SYohann //------------------------------------------------------------------------------
3249e201c85SYohann // E-vector -> Q-vector, offests provided
3259e201c85SYohann //------------------------------------------------------------------------------
32699421279SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int Q_1D>
327f815fac9SJeremy L Thompson inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
328f815fac9SJeremy L Thompson                                                const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
329f815fac9SJeremy L Thompson                                                CeedScalar *__restrict__ r_u) {
33099421279SJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
33199421279SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
33299421279SJeremy L Thompson     const CeedInt ind  = indices[node + elem * Q_1D * Q_1D * Q_1D];
333ca735530SJeremy L Thompson 
334ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
3359e201c85SYohann   }
3369e201c85SYohann }
3379e201c85SYohann 
3389e201c85SYohann //------------------------------------------------------------------------------
3399e201c85SYohann // E-vector -> Q-vector, strided
3409e201c85SYohann //------------------------------------------------------------------------------
34199421279SJeremy L Thompson template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
342f815fac9SJeremy L Thompson inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
343672b0f2aSSebastian Grimberg                                               CeedScalar *__restrict__ r_u) {
34499421279SJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
34599421279SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
3469e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
347ca735530SJeremy L Thompson 
348ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
3499e201c85SYohann   }
3509e201c85SYohann }
3519e201c85SYohann 
3529e201c85SYohann //------------------------------------------------------------------------------
3539e201c85SYohann // E-vector -> L-vector, offsets provided
3549e201c85SYohann //------------------------------------------------------------------------------
35599421279SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
356f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
357672b0f2aSSebastian Grimberg                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
35899421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
35999421279SJeremy L Thompson     for (CeedInt z = 0; z < P_1D; z++) {
36099421279SJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
36199421279SJeremy L Thompson       const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
362ca735530SJeremy L Thompson 
36399421279SJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]);
3649e201c85SYohann     }
3659e201c85SYohann   }
366688b5473SJeremy L Thompson }
3679e201c85SYohann 
3680183ed61SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
3690183ed61SJeremy L Thompson inline __device__ void WriteLVecStandard3d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
3700183ed61SJeremy L Thompson                                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
3710183ed61SJeremy L Thompson                                                   CeedScalar *__restrict__ d_v) {
3720183ed61SJeremy L Thompson   const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
3730183ed61SJeremy L Thompson   const CeedInt target_node_x = n % P_1D;
3740183ed61SJeremy L Thompson   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
3750183ed61SJeremy L Thompson   const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);
3760183ed61SJeremy L Thompson 
3770183ed61SJeremy L Thompson   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
3780183ed61SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D + target_node_z * P_1D * P_1D;
3790183ed61SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
3800183ed61SJeremy L Thompson 
3810183ed61SJeremy L Thompson     atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_node_z + target_comp * P_1D]);
3820183ed61SJeremy L Thompson   }
3830183ed61SJeremy L Thompson }
3840183ed61SJeremy L Thompson 
3859e201c85SYohann //------------------------------------------------------------------------------
386*915834c9SZach Atkins // E-vector -> L-vector, full assembly
387*915834c9SZach Atkins //------------------------------------------------------------------------------
388*915834c9SZach Atkins template <int NUM_COMP, int COMP_STRIDE, int P_1D>
389*915834c9SZach Atkins inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
390*915834c9SZach Atkins                                                     const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
391*915834c9SZach Atkins   const CeedInt elem_size  = P_1D * P_1D * P_1D;
392*915834c9SZach Atkins   const CeedInt in_comp    = in / elem_size;
393*915834c9SZach Atkins   const CeedInt in_node_x  = in % P_1D;
394*915834c9SZach Atkins   const CeedInt in_node_y  = (in % (P_1D * P_1D)) / P_1D;
395*915834c9SZach Atkins   const CeedInt in_node_z  = (in % elem_size) / (P_1D * P_1D);
396*915834c9SZach Atkins   const CeedInt e_vec_size = elem_size * NUM_COMP;
397*915834c9SZach Atkins 
398*915834c9SZach Atkins   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
399*915834c9SZach Atkins     const CeedInt in_node = in_node_x + in_node_y * P_1D + in_node_z * P_1D * P_1D;
400*915834c9SZach Atkins     for (CeedInt z = 0; z < P_1D; z++) {
401*915834c9SZach Atkins       const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
402*915834c9SZach Atkins 
403*915834c9SZach Atkins       for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
404*915834c9SZach Atkins         const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;
405*915834c9SZach Atkins 
406*915834c9SZach Atkins         d_v[elem * e_vec_size * e_vec_size + index] += r_v[z + comp * P_1D];
407*915834c9SZach Atkins       }
408*915834c9SZach Atkins     }
409*915834c9SZach Atkins   }
410*915834c9SZach Atkins }
411*915834c9SZach Atkins 
412*915834c9SZach Atkins //------------------------------------------------------------------------------
4139e201c85SYohann // E-vector -> L-vector, strided
4149e201c85SYohann //------------------------------------------------------------------------------
41599421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
416f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
417672b0f2aSSebastian Grimberg                                           CeedScalar *__restrict__ d_v) {
41899421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
41999421279SJeremy L Thompson     for (CeedInt z = 0; z < P_1D; z++) {
42099421279SJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
4219e201c85SYohann       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
422ca735530SJeremy L Thompson 
42399421279SJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1D];
4249e201c85SYohann     }
4259e201c85SYohann   }
426688b5473SJeremy L Thompson }
4279e201c85SYohann 
4289e201c85SYohann //------------------------------------------------------------------------------
4299e201c85SYohann // 3D collocated derivatives computation
4309e201c85SYohann //------------------------------------------------------------------------------
43199421279SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
432f815fac9SJeremy L Thompson inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
4332b730f8bSJeremy L Thompson                                         CeedScalar *__restrict__ r_V) {
43499421279SJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
435ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
436d6c19ee8SJeremy L Thompson       __syncthreads();
43799421279SJeremy L Thompson       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D];
4389e201c85SYohann       __syncthreads();
4399e201c85SYohann       // X derivative
440ca735530SJeremy L Thompson       r_V[comp + 0 * NUM_COMP] = 0.0;
44199421279SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
44299421279SJeremy L Thompson         r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D];
443688b5473SJeremy L Thompson       }
4449e201c85SYohann       // Y derivative
445ca735530SJeremy L Thompson       r_V[comp + 1 * NUM_COMP] = 0.0;
44699421279SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
44799421279SJeremy L Thompson         r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D];
448688b5473SJeremy L Thompson       }
4499e201c85SYohann       // Z derivative
450ca735530SJeremy L Thompson       r_V[comp + 2 * NUM_COMP] = 0.0;
45199421279SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
45299421279SJeremy L Thompson         r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D];
453688b5473SJeremy L Thompson       }
4549e201c85SYohann     }
4559e201c85SYohann   }
4569e201c85SYohann }
4579e201c85SYohann 
4589e201c85SYohann //------------------------------------------------------------------------------
4599e201c85SYohann // 3D collocated derivatives transpose
4609e201c85SYohann //------------------------------------------------------------------------------
46199421279SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
462f815fac9SJeremy L Thompson inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
4632b730f8bSJeremy L Thompson                                                  CeedScalar *__restrict__ r_V) {
46499421279SJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
465ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
466d6c19ee8SJeremy L Thompson       __syncthreads();
467ca735530SJeremy L Thompson       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
4689e201c85SYohann       __syncthreads();
469688b5473SJeremy L Thompson       // X derivative
47099421279SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
47199421279SJeremy L Thompson         r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D];
472688b5473SJeremy L Thompson       }
4739e201c85SYohann       // Y derivative
474d6c19ee8SJeremy L Thompson       __syncthreads();
475ca735530SJeremy L Thompson       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
4769e201c85SYohann       __syncthreads();
47799421279SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
47899421279SJeremy L Thompson         r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D];
479688b5473SJeremy L Thompson       }
4809e201c85SYohann       // Z derivative
48199421279SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
48299421279SJeremy L Thompson         r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP];
483688b5473SJeremy L Thompson       }
4849e201c85SYohann     }
4859e201c85SYohann   }
4869e201c85SYohann }
487