xref: /libCEED/include/ceed/jit-source/cuda/cuda-gen-templates.h (revision d4cc18453651bd0f94c1a2e078b2646a92dafdcc)
1*9ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
29e201c85SYohann // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39e201c85SYohann //
49e201c85SYohann // SPDX-License-Identifier: BSD-2-Clause
59e201c85SYohann //
69e201c85SYohann // This file is part of CEED:  http://github.com/ceed
79e201c85SYohann 
89e201c85SYohann /// @file
99e201c85SYohann /// Internal header for CUDA backend macro and type definitions for JiT source
10c0b5abf0SJeremy L Thompson #include <ceed/types.h>
119e201c85SYohann 
129e201c85SYohann //------------------------------------------------------------------------------
139e201c85SYohann // Load matrices for basis actions
149e201c85SYohann //------------------------------------------------------------------------------
159e201c85SYohann template <int P, int Q>
LoadMatrix(SharedData_Cuda & data,const CeedScalar * __restrict__ d_B,CeedScalar * B)16f815fac9SJeremy L Thompson inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
172b730f8bSJeremy L Thompson   for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
189e201c85SYohann }
199e201c85SYohann 
209e201c85SYohann //------------------------------------------------------------------------------
218b97b69aSJeremy L Thompson // AtPoints
228b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
238b97b69aSJeremy L Thompson 
248b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
258b97b69aSJeremy L Thompson // L-vector -> single point
268b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
278b97b69aSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
ReadPoint(SharedData_Cuda & data,const CeedInt elem,const CeedInt p,const CeedInt points_in_elem,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ d_u,CeedScalar * r_u)288b97b69aSJeremy L Thompson inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
298b97b69aSJeremy L Thompson                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
308b97b69aSJeremy L Thompson   const CeedInt ind = indices[p + elem * NUM_PTS];
318b97b69aSJeremy L Thompson 
328b97b69aSJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
338b97b69aSJeremy L Thompson     r_u[comp] = d_u[ind + comp * COMP_STRIDE];
348b97b69aSJeremy L Thompson   }
358b97b69aSJeremy L Thompson }
368b97b69aSJeremy L Thompson 
378b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
388b97b69aSJeremy L Thompson // Single point -> L-vector
398b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
408b97b69aSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
WritePoint(SharedData_Cuda & data,const CeedInt elem,const CeedInt p,const CeedInt points_in_elem,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ r_u,CeedScalar * d_u)418b97b69aSJeremy L Thompson inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
428b97b69aSJeremy L Thompson                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
438b97b69aSJeremy L Thompson   if (p < points_in_elem) {
448b97b69aSJeremy L Thompson     const CeedInt ind = indices[p + elem * NUM_PTS];
458b97b69aSJeremy L Thompson 
468b97b69aSJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
478b97b69aSJeremy L Thompson       d_u[ind + comp * COMP_STRIDE] += r_u[comp];
488b97b69aSJeremy L Thompson     }
498b97b69aSJeremy L Thompson   }
508b97b69aSJeremy L Thompson }
518b97b69aSJeremy L Thompson 
528b97b69aSJeremy L Thompson //------------------------------------------------------------------------------
539e201c85SYohann // 1D
549e201c85SYohann //------------------------------------------------------------------------------
559e201c85SYohann 
569e201c85SYohann //------------------------------------------------------------------------------
570183ed61SJeremy L Thompson // Set E-vector value
580183ed61SJeremy L Thompson //------------------------------------------------------------------------------
590183ed61SJeremy L Thompson template <int NUM_COMP, int P_1D>
SetEVecStandard1d_Single(SharedData_Cuda & data,const CeedInt n,const CeedScalar value,CeedScalar * __restrict__ r_v)600183ed61SJeremy L Thompson inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
610183ed61SJeremy L Thompson   const CeedInt target_comp = n / P_1D;
620183ed61SJeremy L Thompson   const CeedInt target_node = n % P_1D;
630183ed61SJeremy L Thompson 
640183ed61SJeremy L Thompson   if (data.t_id_x == target_node) {
650183ed61SJeremy L Thompson     r_v[target_comp] = value;
660183ed61SJeremy L Thompson   }
670183ed61SJeremy L Thompson }
680183ed61SJeremy L Thompson 
690183ed61SJeremy L Thompson //------------------------------------------------------------------------------
709e201c85SYohann // L-vector -> E-vector, offsets provided
719e201c85SYohann //------------------------------------------------------------------------------
7299421279SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
ReadLVecStandard1d(SharedData_Cuda & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ d_u,CeedScalar * __restrict__ r_u)73f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
74672b0f2aSSebastian Grimberg                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
7599421279SJeremy L Thompson   if (data.t_id_x < P_1D) {
769e201c85SYohann     const CeedInt node = data.t_id_x;
7799421279SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D];
78ca735530SJeremy L Thompson 
79ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
809e201c85SYohann   }
819e201c85SYohann }
829e201c85SYohann 
839e201c85SYohann //------------------------------------------------------------------------------
849e201c85SYohann // L-vector -> E-vector, strided
859e201c85SYohann //------------------------------------------------------------------------------
8699421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
ReadLVecStrided1d(SharedData_Cuda & data,const CeedInt elem,const CeedScalar * __restrict__ d_u,CeedScalar * __restrict__ r_u)87f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
88672b0f2aSSebastian Grimberg                                          CeedScalar *__restrict__ r_u) {
8999421279SJeremy L Thompson   if (data.t_id_x < P_1D) {
909e201c85SYohann     const CeedInt node = data.t_id_x;
919e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
92ca735530SJeremy L Thompson 
93ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
949e201c85SYohann   }
959e201c85SYohann }
969e201c85SYohann 
979e201c85SYohann //------------------------------------------------------------------------------
989e201c85SYohann // E-vector -> L-vector, offsets provided
999e201c85SYohann //------------------------------------------------------------------------------
10099421279SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
WriteLVecStandard1d(SharedData_Cuda & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)101f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
102672b0f2aSSebastian Grimberg                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
10399421279SJeremy L Thompson   if (data.t_id_x < P_1D) {
1049e201c85SYohann     const CeedInt node = data.t_id_x;
10599421279SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D];
106ca735530SJeremy L Thompson 
107ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
1089e201c85SYohann   }
1099e201c85SYohann }
1109e201c85SYohann 
1110183ed61SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
WriteLVecStandard1d_Single(SharedData_Cuda & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt n,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)1120183ed61SJeremy L Thompson inline __device__ void WriteLVecStandard1d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
1130183ed61SJeremy L Thompson                                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
1140183ed61SJeremy L Thompson                                                   CeedScalar *__restrict__ d_v) {
1150183ed61SJeremy L Thompson   const CeedInt target_comp = n / P_1D;
1160183ed61SJeremy L Thompson   const CeedInt target_node = n % P_1D;
1170183ed61SJeremy L Thompson 
1180183ed61SJeremy L Thompson   if (data.t_id_x == target_node) {
1190183ed61SJeremy L Thompson     const CeedInt ind = indices[target_node + elem * P_1D];
1200183ed61SJeremy L Thompson 
1210183ed61SJeremy L Thompson     atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
1220183ed61SJeremy L Thompson   }
1230183ed61SJeremy L Thompson }
1240183ed61SJeremy L Thompson 
1259e201c85SYohann //------------------------------------------------------------------------------
126915834c9SZach Atkins // E-vector -> L-vector, full assembly
127915834c9SZach Atkins //------------------------------------------------------------------------------
128915834c9SZach Atkins template <int NUM_COMP, int COMP_STRIDE, int P_1D>
WriteLVecStandard1d_Assembly(SharedData_Cuda & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt in,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)129915834c9SZach Atkins inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
130915834c9SZach Atkins                                                     const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
131915834c9SZach Atkins   const CeedInt in_comp    = in / P_1D;
132915834c9SZach Atkins   const CeedInt in_node    = in % P_1D;
133915834c9SZach Atkins   const CeedInt e_vec_size = P_1D * NUM_COMP;
134915834c9SZach Atkins 
135915834c9SZach Atkins   if (data.t_id_x < P_1D) {
136915834c9SZach Atkins     const CeedInt out_node = data.t_id_x;
137915834c9SZach Atkins 
138915834c9SZach Atkins     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
139915834c9SZach Atkins       d_v[elem * e_vec_size * e_vec_size + (in_comp * NUM_COMP + comp) * P_1D * P_1D + out_node * P_1D + in_node] += r_v[comp];
140915834c9SZach Atkins     }
141915834c9SZach Atkins   }
142915834c9SZach Atkins }
143915834c9SZach Atkins 
144915834c9SZach Atkins //------------------------------------------------------------------------------
1450816752eSJeremy L Thompson // E-vector -> L-vector, Qfunction assembly
1460816752eSJeremy L Thompson //------------------------------------------------------------------------------
1470816752eSJeremy L Thompson template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
WriteLVecStandard1d_QFAssembly(SharedData_Cuda & data,const CeedInt num_elem,const CeedInt elem,const CeedInt input_offset,const CeedInt output_offset,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)1480816752eSJeremy L Thompson inline __device__ void WriteLVecStandard1d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
1490816752eSJeremy L Thompson                                                       const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
1500816752eSJeremy L Thompson   if (data.t_id_x < Q_1D) {
1510816752eSJeremy L Thompson     const CeedInt ind = data.t_id_x + elem * Q_1D;
1520816752eSJeremy L Thompson 
1530816752eSJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
1540816752eSJeremy L Thompson       d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * num_elem)] = r_v[comp];
1550816752eSJeremy L Thompson     }
1560816752eSJeremy L Thompson   }
1570816752eSJeremy L Thompson }
1580816752eSJeremy L Thompson 
1590816752eSJeremy L Thompson //------------------------------------------------------------------------------
1609e201c85SYohann // E-vector -> L-vector, strided
1619e201c85SYohann //------------------------------------------------------------------------------
16299421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
WriteLVecStrided1d(SharedData_Cuda & data,const CeedInt elem,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)163f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
164672b0f2aSSebastian Grimberg                                           CeedScalar *__restrict__ d_v) {
16599421279SJeremy L Thompson   if (data.t_id_x < P_1D) {
1669e201c85SYohann     const CeedInt node = data.t_id_x;
1679e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
168ca735530SJeremy L Thompson 
169ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
1709e201c85SYohann   }
1719e201c85SYohann }
1729e201c85SYohann 
1739e201c85SYohann //------------------------------------------------------------------------------
1749e201c85SYohann // 2D
1759e201c85SYohann //------------------------------------------------------------------------------
1769e201c85SYohann 
1779e201c85SYohann //------------------------------------------------------------------------------
1780183ed61SJeremy L Thompson // Set E-vector value
1790183ed61SJeremy L Thompson //------------------------------------------------------------------------------
1800183ed61SJeremy L Thompson template <int NUM_COMP, int P_1D>
SetEVecStandard2d_Single(SharedData_Cuda & data,const CeedInt n,const CeedScalar value,CeedScalar * __restrict__ r_v)1810183ed61SJeremy L Thompson inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
1820183ed61SJeremy L Thompson   const CeedInt target_comp   = n / (P_1D * P_1D);
1830183ed61SJeremy L Thompson   const CeedInt target_node_x = n % P_1D;
1840183ed61SJeremy L Thompson   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
1850183ed61SJeremy L Thompson 
1860183ed61SJeremy L Thompson   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
1870183ed61SJeremy L Thompson     r_v[target_comp] = value;
1880183ed61SJeremy L Thompson   }
1890183ed61SJeremy L Thompson }
1900183ed61SJeremy L Thompson 
1910183ed61SJeremy L Thompson //------------------------------------------------------------------------------
1929e201c85SYohann // L-vector -> E-vector, offsets provided
1939e201c85SYohann //------------------------------------------------------------------------------
19499421279SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
ReadLVecStandard2d(SharedData_Cuda & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ d_u,CeedScalar * __restrict__ r_u)195f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
196672b0f2aSSebastian Grimberg                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
19799421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
19899421279SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
19999421279SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D * P_1D];
200ca735530SJeremy L Thompson 
201ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
2029e201c85SYohann   }
2039e201c85SYohann }
2049e201c85SYohann 
2059e201c85SYohann //------------------------------------------------------------------------------
2069e201c85SYohann // L-vector -> E-vector, strided
2079e201c85SYohann //------------------------------------------------------------------------------
20899421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
ReadLVecStrided2d(SharedData_Cuda & data,const CeedInt elem,const CeedScalar * __restrict__ d_u,CeedScalar * __restrict__ r_u)209f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
210672b0f2aSSebastian Grimberg                                          CeedScalar *__restrict__ r_u) {
21199421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
21299421279SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
2139e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
214ca735530SJeremy L Thompson 
215ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
2169e201c85SYohann   }
2179e201c85SYohann }
2189e201c85SYohann 
2199e201c85SYohann //------------------------------------------------------------------------------
2209e201c85SYohann // E-vector -> L-vector, offsets provided
2219e201c85SYohann //------------------------------------------------------------------------------
22299421279SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
WriteLVecStandard2d(SharedData_Cuda & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)223f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
224672b0f2aSSebastian Grimberg                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
22599421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
22699421279SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
22799421279SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D * P_1D];
228ca735530SJeremy L Thompson 
229ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
2309e201c85SYohann   }
2319e201c85SYohann }
2329e201c85SYohann 
2330183ed61SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
WriteLVecStandard2d_Single(SharedData_Cuda & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt n,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)2340183ed61SJeremy L Thompson inline __device__ void WriteLVecStandard2d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
2350183ed61SJeremy L Thompson                                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
2360183ed61SJeremy L Thompson                                                   CeedScalar *__restrict__ d_v) {
2370183ed61SJeremy L Thompson   const CeedInt target_comp   = n / (P_1D * P_1D);
2380183ed61SJeremy L Thompson   const CeedInt target_node_x = n % P_1D;
2390183ed61SJeremy L Thompson   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
2400183ed61SJeremy L Thompson 
2410183ed61SJeremy L Thompson   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
2420183ed61SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
2430183ed61SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D * P_1D];
2440183ed61SJeremy L Thompson 
2450183ed61SJeremy L Thompson     atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
2460183ed61SJeremy L Thompson   }
2470183ed61SJeremy L Thompson }
2480183ed61SJeremy L Thompson 
2499e201c85SYohann //------------------------------------------------------------------------------
250915834c9SZach Atkins // E-vector -> L-vector, full assembly
251915834c9SZach Atkins //------------------------------------------------------------------------------
252915834c9SZach Atkins template <int NUM_COMP, int COMP_STRIDE, int P_1D>
WriteLVecStandard2d_Assembly(SharedData_Cuda & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt in,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)253915834c9SZach Atkins inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
254915834c9SZach Atkins                                                     const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
255915834c9SZach Atkins   const CeedInt elem_size  = P_1D * P_1D;
256915834c9SZach Atkins   const CeedInt in_comp    = in / elem_size;
257915834c9SZach Atkins   const CeedInt in_node_x  = in % P_1D;
258915834c9SZach Atkins   const CeedInt in_node_y  = (in % elem_size) / P_1D;
259915834c9SZach Atkins   const CeedInt e_vec_size = elem_size * NUM_COMP;
260915834c9SZach Atkins 
261915834c9SZach Atkins   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
262915834c9SZach Atkins     const CeedInt in_node  = in_node_x + in_node_y * P_1D;
263915834c9SZach Atkins     const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D;
264915834c9SZach Atkins 
265915834c9SZach Atkins     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
266915834c9SZach Atkins       const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;
267915834c9SZach Atkins 
268915834c9SZach Atkins       d_v[elem * e_vec_size * e_vec_size + index] += r_v[comp];
269915834c9SZach Atkins     }
270915834c9SZach Atkins   }
271915834c9SZach Atkins }
272915834c9SZach Atkins 
273915834c9SZach Atkins //------------------------------------------------------------------------------
2740816752eSJeremy L Thompson // E-vector -> L-vector, Qfunction assembly
2750816752eSJeremy L Thompson //------------------------------------------------------------------------------
2760816752eSJeremy L Thompson template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
WriteLVecStandard2d_QFAssembly(SharedData_Cuda & data,const CeedInt num_elem,const CeedInt elem,const CeedInt input_offset,const CeedInt output_offset,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)2770816752eSJeremy L Thompson inline __device__ void WriteLVecStandard2d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
2780816752eSJeremy L Thompson                                                       const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
2790816752eSJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
2800816752eSJeremy L Thompson     const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D) + elem * Q_1D * Q_1D;
2810816752eSJeremy L Thompson 
2820816752eSJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
2830816752eSJeremy L Thompson       d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * num_elem)] = r_v[comp];
2840816752eSJeremy L Thompson     }
2850816752eSJeremy L Thompson   }
2860816752eSJeremy L Thompson }
2870816752eSJeremy L Thompson 
2880816752eSJeremy L Thompson //------------------------------------------------------------------------------
2899e201c85SYohann // E-vector -> L-vector, strided
2909e201c85SYohann //------------------------------------------------------------------------------
29199421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
WriteLVecStrided2d(SharedData_Cuda & data,const CeedInt elem,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)292f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
293672b0f2aSSebastian Grimberg                                           CeedScalar *__restrict__ d_v) {
29499421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
29599421279SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
2969e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
297ca735530SJeremy L Thompson 
298ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
2999e201c85SYohann   }
3009e201c85SYohann }
3019e201c85SYohann 
3029e201c85SYohann //------------------------------------------------------------------------------
3039e201c85SYohann // 3D
3049e201c85SYohann //------------------------------------------------------------------------------
3059e201c85SYohann 
3069e201c85SYohann //------------------------------------------------------------------------------
3070183ed61SJeremy L Thompson // Set E-vector value
3080183ed61SJeremy L Thompson //------------------------------------------------------------------------------
3090183ed61SJeremy L Thompson template <int NUM_COMP, int P_1D>
SetEVecStandard3d_Single(SharedData_Cuda & data,const CeedInt n,const CeedScalar value,CeedScalar * __restrict__ r_v)3100183ed61SJeremy L Thompson inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
3110183ed61SJeremy L Thompson   const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
3120183ed61SJeremy L Thompson   const CeedInt target_node_x = n % P_1D;
3130183ed61SJeremy L Thompson   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
3140183ed61SJeremy L Thompson   const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);
3150183ed61SJeremy L Thompson 
3160183ed61SJeremy L Thompson   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
3170183ed61SJeremy L Thompson     r_v[target_node_z + target_comp * P_1D] = value;
3180183ed61SJeremy L Thompson   }
3190183ed61SJeremy L Thompson }
3200183ed61SJeremy L Thompson 
3210183ed61SJeremy L Thompson //------------------------------------------------------------------------------
3229e201c85SYohann // L-vector -> E-vector, offsets provided
3239e201c85SYohann //------------------------------------------------------------------------------
32499421279SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
ReadLVecStandard3d(SharedData_Cuda & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ d_u,CeedScalar * __restrict__ r_u)325f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
326672b0f2aSSebastian Grimberg                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
32799421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
32899421279SJeremy L Thompson     for (CeedInt z = 0; z < P_1D; z++) {
32999421279SJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
33099421279SJeremy L Thompson       const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
331ca735530SJeremy L Thompson 
33299421279SJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + COMP_STRIDE * comp];
3339e201c85SYohann     }
3349e201c85SYohann   }
335688b5473SJeremy L Thompson }
3369e201c85SYohann 
3379e201c85SYohann //------------------------------------------------------------------------------
3389e201c85SYohann // L-vector -> E-vector, strided
3399e201c85SYohann //------------------------------------------------------------------------------
34099421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
ReadLVecStrided3d(SharedData_Cuda & data,const CeedInt elem,const CeedScalar * __restrict__ d_u,CeedScalar * __restrict__ r_u)341f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
342672b0f2aSSebastian Grimberg                                          CeedScalar *__restrict__ r_u) {
34399421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
34499421279SJeremy L Thompson     for (CeedInt z = 0; z < P_1D; z++) {
34599421279SJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
3469e201c85SYohann       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
347ca735530SJeremy L Thompson 
34899421279SJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + comp * STRIDES_COMP];
3499e201c85SYohann     }
3509e201c85SYohann   }
351688b5473SJeremy L Thompson }
3529e201c85SYohann 
3539e201c85SYohann //------------------------------------------------------------------------------
3549e201c85SYohann // E-vector -> Q-vector, offests provided
3559e201c85SYohann //------------------------------------------------------------------------------
35699421279SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int Q_1D>
ReadEVecSliceStandard3d(SharedData_Cuda & data,const CeedInt nquads,const CeedInt elem,const CeedInt q,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ d_u,CeedScalar * __restrict__ r_u)357f815fac9SJeremy L Thompson inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
358f815fac9SJeremy L Thompson                                                const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
359f815fac9SJeremy L Thompson                                                CeedScalar *__restrict__ r_u) {
36099421279SJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
36199421279SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
36299421279SJeremy L Thompson     const CeedInt ind  = indices[node + elem * Q_1D * Q_1D * Q_1D];
363ca735530SJeremy L Thompson 
364ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
3659e201c85SYohann   }
3669e201c85SYohann }
3679e201c85SYohann 
3689e201c85SYohann //------------------------------------------------------------------------------
3699e201c85SYohann // E-vector -> Q-vector, strided
3709e201c85SYohann //------------------------------------------------------------------------------
37199421279SJeremy L Thompson template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
ReadEVecSliceStrided3d(SharedData_Cuda & data,const CeedInt elem,const CeedInt q,const CeedScalar * __restrict__ d_u,CeedScalar * __restrict__ r_u)372f815fac9SJeremy L Thompson inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
373672b0f2aSSebastian Grimberg                                               CeedScalar *__restrict__ r_u) {
37499421279SJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
37599421279SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
3769e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
377ca735530SJeremy L Thompson 
378ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
3799e201c85SYohann   }
3809e201c85SYohann }
3819e201c85SYohann 
3829e201c85SYohann //------------------------------------------------------------------------------
3839e201c85SYohann // E-vector -> L-vector, offsets provided
3849e201c85SYohann //------------------------------------------------------------------------------
38599421279SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
WriteLVecStandard3d(SharedData_Cuda & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)386f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
387672b0f2aSSebastian Grimberg                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
38899421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
38999421279SJeremy L Thompson     for (CeedInt z = 0; z < P_1D; z++) {
39099421279SJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
39199421279SJeremy L Thompson       const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
392ca735530SJeremy L Thompson 
39399421279SJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]);
3949e201c85SYohann     }
3959e201c85SYohann   }
396688b5473SJeremy L Thompson }
3979e201c85SYohann 
3980183ed61SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
WriteLVecStandard3d_Single(SharedData_Cuda & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt n,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)3990183ed61SJeremy L Thompson inline __device__ void WriteLVecStandard3d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
4000183ed61SJeremy L Thompson                                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
4010183ed61SJeremy L Thompson                                                   CeedScalar *__restrict__ d_v) {
4020183ed61SJeremy L Thompson   const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
4030183ed61SJeremy L Thompson   const CeedInt target_node_x = n % P_1D;
4040183ed61SJeremy L Thompson   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
4050183ed61SJeremy L Thompson   const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);
4060183ed61SJeremy L Thompson 
4070183ed61SJeremy L Thompson   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
4080183ed61SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D + target_node_z * P_1D * P_1D;
4090183ed61SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
4100183ed61SJeremy L Thompson 
4110183ed61SJeremy L Thompson     atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_node_z + target_comp * P_1D]);
4120183ed61SJeremy L Thompson   }
4130183ed61SJeremy L Thompson }
4140183ed61SJeremy L Thompson 
4159e201c85SYohann //------------------------------------------------------------------------------
416915834c9SZach Atkins // E-vector -> L-vector, full assembly
417915834c9SZach Atkins //------------------------------------------------------------------------------
418915834c9SZach Atkins template <int NUM_COMP, int COMP_STRIDE, int P_1D>
WriteLVecStandard3d_Assembly(SharedData_Cuda & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt in,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)419915834c9SZach Atkins inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
420915834c9SZach Atkins                                                     const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
421915834c9SZach Atkins   const CeedInt elem_size  = P_1D * P_1D * P_1D;
422915834c9SZach Atkins   const CeedInt in_comp    = in / elem_size;
423915834c9SZach Atkins   const CeedInt in_node_x  = in % P_1D;
424915834c9SZach Atkins   const CeedInt in_node_y  = (in % (P_1D * P_1D)) / P_1D;
425915834c9SZach Atkins   const CeedInt in_node_z  = (in % elem_size) / (P_1D * P_1D);
426915834c9SZach Atkins   const CeedInt e_vec_size = elem_size * NUM_COMP;
427915834c9SZach Atkins 
428915834c9SZach Atkins   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
429915834c9SZach Atkins     const CeedInt in_node = in_node_x + in_node_y * P_1D + in_node_z * P_1D * P_1D;
430915834c9SZach Atkins     for (CeedInt z = 0; z < P_1D; z++) {
431915834c9SZach Atkins       const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
432915834c9SZach Atkins 
433915834c9SZach Atkins       for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
434915834c9SZach Atkins         const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;
435915834c9SZach Atkins 
436915834c9SZach Atkins         d_v[elem * e_vec_size * e_vec_size + index] += r_v[z + comp * P_1D];
437915834c9SZach Atkins       }
438915834c9SZach Atkins     }
439915834c9SZach Atkins   }
440915834c9SZach Atkins }
441915834c9SZach Atkins 
442915834c9SZach Atkins //------------------------------------------------------------------------------
4430816752eSJeremy L Thompson // E-vector -> L-vector, Qfunction assembly
4440816752eSJeremy L Thompson //------------------------------------------------------------------------------
4450816752eSJeremy L Thompson template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
WriteLVecStandard3d_QFAssembly(SharedData_Cuda & data,const CeedInt num_elem,const CeedInt elem,const CeedInt input_offset,const CeedInt output_offset,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)4460816752eSJeremy L Thompson inline __device__ void WriteLVecStandard3d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
4470816752eSJeremy L Thompson                                                       const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
4480816752eSJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
4490816752eSJeremy L Thompson     for (CeedInt z = 0; z < Q_1D; z++) {
4500816752eSJeremy L Thompson       const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D + z * Q_1D * Q_1D) + elem * Q_1D * Q_1D * Q_1D;
4510816752eSJeremy L Thompson 
4520816752eSJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
4530816752eSJeremy L Thompson         d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * Q_1D * num_elem)] = r_v[z + comp * Q_1D];
4540816752eSJeremy L Thompson       }
4550816752eSJeremy L Thompson     }
4560816752eSJeremy L Thompson   }
4570816752eSJeremy L Thompson }
4580816752eSJeremy L Thompson 
4590816752eSJeremy L Thompson //------------------------------------------------------------------------------
4609e201c85SYohann // E-vector -> L-vector, strided
4619e201c85SYohann //------------------------------------------------------------------------------
46299421279SJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
WriteLVecStrided3d(SharedData_Cuda & data,const CeedInt elem,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)463f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
464672b0f2aSSebastian Grimberg                                           CeedScalar *__restrict__ d_v) {
46599421279SJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
46699421279SJeremy L Thompson     for (CeedInt z = 0; z < P_1D; z++) {
46799421279SJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
4689e201c85SYohann       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
469ca735530SJeremy L Thompson 
47099421279SJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1D];
4719e201c85SYohann     }
4729e201c85SYohann   }
473688b5473SJeremy L Thompson }
4749e201c85SYohann 
4759e201c85SYohann //------------------------------------------------------------------------------
4769e201c85SYohann // 3D collocated derivatives computation
4779e201c85SYohann //------------------------------------------------------------------------------
47899421279SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
GradColloSlice3d(SharedData_Cuda & data,const CeedInt q,const CeedScalar * __restrict__ r_U,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)479f815fac9SJeremy L Thompson inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
4802b730f8bSJeremy L Thompson                                         CeedScalar *__restrict__ r_V) {
48199421279SJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
482ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
483d6c19ee8SJeremy L Thompson       __syncthreads();
48499421279SJeremy L Thompson       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D];
4859e201c85SYohann       __syncthreads();
4869e201c85SYohann       // X derivative
487ca735530SJeremy L Thompson       r_V[comp + 0 * NUM_COMP] = 0.0;
48899421279SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
48999421279SJeremy L Thompson         r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D];
490688b5473SJeremy L Thompson       }
4919e201c85SYohann       // Y derivative
492ca735530SJeremy L Thompson       r_V[comp + 1 * NUM_COMP] = 0.0;
49399421279SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
49499421279SJeremy L Thompson         r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D];
495688b5473SJeremy L Thompson       }
4969e201c85SYohann       // Z derivative
497ca735530SJeremy L Thompson       r_V[comp + 2 * NUM_COMP] = 0.0;
49899421279SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
49999421279SJeremy L Thompson         r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D];
500688b5473SJeremy L Thompson       }
5019e201c85SYohann     }
5029e201c85SYohann   }
5039e201c85SYohann }
5049e201c85SYohann 
5059e201c85SYohann //------------------------------------------------------------------------------
5069e201c85SYohann // 3D collocated derivatives transpose
5079e201c85SYohann //------------------------------------------------------------------------------
50899421279SJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
GradColloSliceTranspose3d(SharedData_Cuda & data,const CeedInt q,const CeedScalar * __restrict__ r_U,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)509f815fac9SJeremy L Thompson inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
5102b730f8bSJeremy L Thompson                                                  CeedScalar *__restrict__ r_V) {
51199421279SJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
512ca735530SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
513d6c19ee8SJeremy L Thompson       __syncthreads();
514ca735530SJeremy L Thompson       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
5159e201c85SYohann       __syncthreads();
516688b5473SJeremy L Thompson       // X derivative
51799421279SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
51899421279SJeremy L Thompson         r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D];
519688b5473SJeremy L Thompson       }
5209e201c85SYohann       // Y derivative
521d6c19ee8SJeremy L Thompson       __syncthreads();
522ca735530SJeremy L Thompson       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
5239e201c85SYohann       __syncthreads();
52499421279SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
52599421279SJeremy L Thompson         r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D];
526688b5473SJeremy L Thompson       }
5279e201c85SYohann       // Z derivative
52899421279SJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
52999421279SJeremy L Thompson         r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP];
530688b5473SJeremy L Thompson       }
5319e201c85SYohann     }
5329e201c85SYohann   }
5339e201c85SYohann }
534