xref: /libCEED/include/ceed/jit-source/hip/hip-gen-templates.h (revision d4cc18453651bd0f94c1a2e078b2646a92dafdcc)
1*9ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
29e201c85SYohann // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39e201c85SYohann //
49e201c85SYohann // SPDX-License-Identifier: BSD-2-Clause
59e201c85SYohann //
69e201c85SYohann // This file is part of CEED:  http://github.com/ceed
79e201c85SYohann 
89e201c85SYohann /// @file
99e201c85SYohann /// Internal header for HIP backend macro and type definitions for JiT source
10c0b5abf0SJeremy L Thompson #include <ceed/types.h>
119e201c85SYohann 
129e201c85SYohann //------------------------------------------------------------------------------
139e201c85SYohann // Load matrices for basis actions
149e201c85SYohann //------------------------------------------------------------------------------
159e201c85SYohann template <int P, int Q>
LoadMatrix(SharedData_Hip & data,const CeedScalar * __restrict__ d_B,CeedScalar * B)16f815fac9SJeremy L Thompson inline __device__ void LoadMatrix(SharedData_Hip &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
172b730f8bSJeremy L Thompson   for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
189e201c85SYohann }
199e201c85SYohann 
209e201c85SYohann //------------------------------------------------------------------------------
213a2968d6SJeremy L Thompson // AtPoints
223a2968d6SJeremy L Thompson //------------------------------------------------------------------------------
233a2968d6SJeremy L Thompson 
243a2968d6SJeremy L Thompson //------------------------------------------------------------------------------
253a2968d6SJeremy L Thompson // L-vector -> single point
263a2968d6SJeremy L Thompson //------------------------------------------------------------------------------
273a2968d6SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
ReadPoint(SharedData_Hip & data,const CeedInt elem,const CeedInt p,const CeedInt points_in_elem,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ d_u,CeedScalar * r_u)283a2968d6SJeremy L Thompson inline __device__ void ReadPoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
293a2968d6SJeremy L Thompson                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
303a2968d6SJeremy L Thompson   const CeedInt ind = indices[p + elem * NUM_PTS];
313a2968d6SJeremy L Thompson 
323a2968d6SJeremy L Thompson   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
333a2968d6SJeremy L Thompson     r_u[comp] = d_u[ind + comp * COMP_STRIDE];
343a2968d6SJeremy L Thompson   }
353a2968d6SJeremy L Thompson }
363a2968d6SJeremy L Thompson 
373a2968d6SJeremy L Thompson //------------------------------------------------------------------------------
383a2968d6SJeremy L Thompson // Single point -> L-vector
393a2968d6SJeremy L Thompson //------------------------------------------------------------------------------
403a2968d6SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
WritePoint(SharedData_Hip & data,const CeedInt elem,const CeedInt p,const CeedInt points_in_elem,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ r_u,CeedScalar * d_u)413a2968d6SJeremy L Thompson inline __device__ void WritePoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
423a2968d6SJeremy L Thompson                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
433a2968d6SJeremy L Thompson   if (p < points_in_elem) {
443a2968d6SJeremy L Thompson     const CeedInt ind = indices[p + elem * NUM_PTS];
453a2968d6SJeremy L Thompson 
463a2968d6SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
473a2968d6SJeremy L Thompson       d_u[ind + comp * COMP_STRIDE] += r_u[comp];
483a2968d6SJeremy L Thompson     }
493a2968d6SJeremy L Thompson   }
503a2968d6SJeremy L Thompson }
513a2968d6SJeremy L Thompson 
523a2968d6SJeremy L Thompson //------------------------------------------------------------------------------
539e201c85SYohann // 1D
549e201c85SYohann //------------------------------------------------------------------------------
559e201c85SYohann 
569e201c85SYohann //------------------------------------------------------------------------------
570183ed61SJeremy L Thompson // Set E-vector value
580183ed61SJeremy L Thompson //------------------------------------------------------------------------------
590183ed61SJeremy L Thompson template <int NUM_COMP, int P_1D>
SetEVecStandard1d_Single(SharedData_Hip & data,const CeedInt n,const CeedScalar value,CeedScalar * __restrict__ r_v)600183ed61SJeremy L Thompson inline __device__ void SetEVecStandard1d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
610183ed61SJeremy L Thompson   const CeedInt target_comp = n / P_1D;
620183ed61SJeremy L Thompson   const CeedInt target_node = n % P_1D;
630183ed61SJeremy L Thompson 
640183ed61SJeremy L Thompson   if (data.t_id_x == target_node) {
650183ed61SJeremy L Thompson     r_v[target_comp] = value;
660183ed61SJeremy L Thompson   }
670183ed61SJeremy L Thompson }
680183ed61SJeremy L Thompson 
690183ed61SJeremy L Thompson //------------------------------------------------------------------------------
709e201c85SYohann // L-vector -> E-vector, offsets provided
719e201c85SYohann //------------------------------------------------------------------------------
726b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
ReadLVecStandard1d(SharedData_Hip & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ d_u,CeedScalar * __restrict__ r_u)73f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
74672b0f2aSSebastian Grimberg                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
756b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D) {
769e201c85SYohann     const CeedInt node = data.t_id_x;
776b92dc4bSJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D];
78672b0f2aSSebastian Grimberg 
79672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
809e201c85SYohann   }
819e201c85SYohann }
829e201c85SYohann 
839e201c85SYohann //------------------------------------------------------------------------------
849e201c85SYohann // L-vector -> E-vector, strided
859e201c85SYohann //------------------------------------------------------------------------------
866b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
ReadLVecStrided1d(SharedData_Hip & data,const CeedInt elem,const CeedScalar * __restrict__ d_u,CeedScalar * __restrict__ r_u)87f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
886b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D) {
899e201c85SYohann     const CeedInt node = data.t_id_x;
909e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
91672b0f2aSSebastian Grimberg 
92672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
939e201c85SYohann   }
949e201c85SYohann }
959e201c85SYohann 
969e201c85SYohann //------------------------------------------------------------------------------
979e201c85SYohann // E-vector -> L-vector, offsets provided
989e201c85SYohann //------------------------------------------------------------------------------
996b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
WriteLVecStandard1d(SharedData_Hip & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)100f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
101672b0f2aSSebastian Grimberg                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
1026b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D) {
1039e201c85SYohann     const CeedInt node = data.t_id_x;
1046b92dc4bSJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D];
105672b0f2aSSebastian Grimberg 
106672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
1079e201c85SYohann   }
1089e201c85SYohann }
1099e201c85SYohann 
1100183ed61SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
WriteLVecStandard1d_Single(SharedData_Hip & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt n,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)1110183ed61SJeremy L Thompson inline __device__ void WriteLVecStandard1d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
1120183ed61SJeremy L Thompson                                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
1130183ed61SJeremy L Thompson                                                   CeedScalar *__restrict__ d_v) {
1140183ed61SJeremy L Thompson   const CeedInt target_comp = n / P_1D;
1150183ed61SJeremy L Thompson   const CeedInt target_node = n % P_1D;
1160183ed61SJeremy L Thompson 
1170183ed61SJeremy L Thompson   if (data.t_id_x == target_node) {
1180183ed61SJeremy L Thompson     const CeedInt ind = indices[target_node + elem * P_1D];
1190183ed61SJeremy L Thompson 
1200183ed61SJeremy L Thompson     atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
1210183ed61SJeremy L Thompson   }
1220183ed61SJeremy L Thompson }
1230183ed61SJeremy L Thompson 
1249e201c85SYohann //------------------------------------------------------------------------------
125692716b7SZach Atkins // E-vector -> L-vector, full assembly
126692716b7SZach Atkins //------------------------------------------------------------------------------
127692716b7SZach Atkins template <int NUM_COMP, int COMP_STRIDE, int P_1D>
WriteLVecStandard1d_Assembly(SharedData_Hip & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt in,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)128692716b7SZach Atkins inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
129692716b7SZach Atkins                                                     const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
130692716b7SZach Atkins   const CeedInt in_comp    = in / P_1D;
131692716b7SZach Atkins   const CeedInt in_node    = in % P_1D;
132692716b7SZach Atkins   const CeedInt e_vec_size = P_1D * NUM_COMP;
133692716b7SZach Atkins 
134692716b7SZach Atkins   if (data.t_id_x < P_1D) {
135692716b7SZach Atkins     const CeedInt out_node = data.t_id_x;
136692716b7SZach Atkins 
137692716b7SZach Atkins     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
138692716b7SZach Atkins       d_v[elem * e_vec_size * e_vec_size + (in_comp * NUM_COMP + comp) * P_1D * P_1D + out_node * P_1D + in_node] += r_v[comp];
139692716b7SZach Atkins     }
140692716b7SZach Atkins   }
141692716b7SZach Atkins }
142692716b7SZach Atkins 
143692716b7SZach Atkins //------------------------------------------------------------------------------
1445daefc96SJeremy L Thompson // E-vector -> L-vector, Qfunction assembly
1455daefc96SJeremy L Thompson //------------------------------------------------------------------------------
1465daefc96SJeremy L Thompson template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
WriteLVecStandard1d_QFAssembly(SharedData_Hip & data,const CeedInt num_elem,const CeedInt elem,const CeedInt input_offset,const CeedInt output_offset,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)1475daefc96SJeremy L Thompson inline __device__ void WriteLVecStandard1d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
1485daefc96SJeremy L Thompson                                                       const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
1495daefc96SJeremy L Thompson   if (data.t_id_x < Q_1D) {
1505daefc96SJeremy L Thompson     const CeedInt ind = data.t_id_x + elem * Q_1D;
1515daefc96SJeremy L Thompson 
1525daefc96SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
1535daefc96SJeremy L Thompson       d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * num_elem)] = r_v[comp];
1545daefc96SJeremy L Thompson     }
1555daefc96SJeremy L Thompson   }
1565daefc96SJeremy L Thompson }
1575daefc96SJeremy L Thompson 
1585daefc96SJeremy L Thompson //------------------------------------------------------------------------------
1599e201c85SYohann // E-vector -> L-vector, strided
1609e201c85SYohann //------------------------------------------------------------------------------
1616b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
WriteLVecStrided1d(SharedData_Hip & data,const CeedInt elem,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)162f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
163672b0f2aSSebastian Grimberg                                           CeedScalar *__restrict__ d_v) {
1646b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D) {
1659e201c85SYohann     const CeedInt node = data.t_id_x;
1669e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
167672b0f2aSSebastian Grimberg 
168672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
1699e201c85SYohann   }
1709e201c85SYohann }
1719e201c85SYohann 
1729e201c85SYohann //------------------------------------------------------------------------------
1739e201c85SYohann // 2D
1749e201c85SYohann //------------------------------------------------------------------------------
1759e201c85SYohann 
1769e201c85SYohann //------------------------------------------------------------------------------
1770183ed61SJeremy L Thompson // Set E-vector value
1780183ed61SJeremy L Thompson //------------------------------------------------------------------------------
1790183ed61SJeremy L Thompson template <int NUM_COMP, int P_1D>
SetEVecStandard2d_Single(SharedData_Hip & data,const CeedInt n,const CeedScalar value,CeedScalar * __restrict__ r_v)1800183ed61SJeremy L Thompson inline __device__ void SetEVecStandard2d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
1810183ed61SJeremy L Thompson   const CeedInt target_comp   = n / (P_1D * P_1D);
1820183ed61SJeremy L Thompson   const CeedInt target_node_x = n % P_1D;
1830183ed61SJeremy L Thompson   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
1840183ed61SJeremy L Thompson 
1850183ed61SJeremy L Thompson   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
1860183ed61SJeremy L Thompson     r_v[target_comp] = value;
1870183ed61SJeremy L Thompson   }
1880183ed61SJeremy L Thompson }
1890183ed61SJeremy L Thompson 
1900183ed61SJeremy L Thompson //------------------------------------------------------------------------------
1919e201c85SYohann // L-vector -> E-vector, offsets provided
1929e201c85SYohann //------------------------------------------------------------------------------
1936b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
ReadLVecStandard2d(SharedData_Hip & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ d_u,CeedScalar * __restrict__ r_u)194f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
195672b0f2aSSebastian Grimberg                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
1966b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
1976b92dc4bSJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
1986b92dc4bSJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D * P_1D];
199672b0f2aSSebastian Grimberg 
200672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
2019e201c85SYohann   }
2029e201c85SYohann }
2039e201c85SYohann 
2049e201c85SYohann //------------------------------------------------------------------------------
2059e201c85SYohann // L-vector -> E-vector, strided
2069e201c85SYohann //------------------------------------------------------------------------------
2076b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
ReadLVecStrided2d(SharedData_Hip & data,const CeedInt elem,const CeedScalar * __restrict__ d_u,CeedScalar * __restrict__ r_u)208f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
2096b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
2106b92dc4bSJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
2119e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
212672b0f2aSSebastian Grimberg 
213672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
2149e201c85SYohann   }
2159e201c85SYohann }
2169e201c85SYohann 
2179e201c85SYohann //------------------------------------------------------------------------------
2189e201c85SYohann // E-vector -> L-vector, offsets provided
2199e201c85SYohann //------------------------------------------------------------------------------
2206b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
WriteLVecStandard2d(SharedData_Hip & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)221f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
222672b0f2aSSebastian Grimberg                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
2236b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
2246b92dc4bSJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
2256b92dc4bSJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D * P_1D];
226672b0f2aSSebastian Grimberg 
227672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
2289e201c85SYohann   }
2299e201c85SYohann }
2309e201c85SYohann 
2310183ed61SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
WriteLVecStandard2d_Single(SharedData_Hip & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt n,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)2320183ed61SJeremy L Thompson inline __device__ void WriteLVecStandard2d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
2330183ed61SJeremy L Thompson                                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
2340183ed61SJeremy L Thompson                                                   CeedScalar *__restrict__ d_v) {
2350183ed61SJeremy L Thompson   const CeedInt target_comp   = n / (P_1D * P_1D);
2360183ed61SJeremy L Thompson   const CeedInt target_node_x = n % P_1D;
2370183ed61SJeremy L Thompson   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
2380183ed61SJeremy L Thompson 
2390183ed61SJeremy L Thompson   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
2400183ed61SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
2410183ed61SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D * P_1D];
2420183ed61SJeremy L Thompson 
2430183ed61SJeremy L Thompson     atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
2440183ed61SJeremy L Thompson   }
2450183ed61SJeremy L Thompson }
2460183ed61SJeremy L Thompson 
2479e201c85SYohann //------------------------------------------------------------------------------
248692716b7SZach Atkins // E-vector -> L-vector, full assembly
249692716b7SZach Atkins //------------------------------------------------------------------------------
250692716b7SZach Atkins template <int NUM_COMP, int COMP_STRIDE, int P_1D>
WriteLVecStandard2d_Assembly(SharedData_Hip & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt in,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)251692716b7SZach Atkins inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
252692716b7SZach Atkins                                                     const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
253692716b7SZach Atkins   const CeedInt elem_size  = P_1D * P_1D;
254692716b7SZach Atkins   const CeedInt in_comp    = in / elem_size;
255692716b7SZach Atkins   const CeedInt in_node_x  = in % P_1D;
256692716b7SZach Atkins   const CeedInt in_node_y  = (in % elem_size) / P_1D;
257692716b7SZach Atkins   const CeedInt e_vec_size = elem_size * NUM_COMP;
258692716b7SZach Atkins 
259692716b7SZach Atkins   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
260692716b7SZach Atkins     const CeedInt in_node  = in_node_x + in_node_y * P_1D;
261692716b7SZach Atkins     const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D;
262692716b7SZach Atkins 
263692716b7SZach Atkins     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
264692716b7SZach Atkins       const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;
265692716b7SZach Atkins 
266692716b7SZach Atkins       d_v[elem * e_vec_size * e_vec_size + index] += r_v[comp];
267692716b7SZach Atkins     }
268692716b7SZach Atkins   }
269692716b7SZach Atkins }
270692716b7SZach Atkins 
271692716b7SZach Atkins //------------------------------------------------------------------------------
2725daefc96SJeremy L Thompson // E-vector -> L-vector, Qfunction assembly
2735daefc96SJeremy L Thompson //------------------------------------------------------------------------------
2745daefc96SJeremy L Thompson template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
WriteLVecStandard2d_QFAssembly(SharedData_Hip & data,const CeedInt num_elem,const CeedInt elem,const CeedInt input_offset,const CeedInt output_offset,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)2755daefc96SJeremy L Thompson inline __device__ void WriteLVecStandard2d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
2765daefc96SJeremy L Thompson                                                       const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
2775daefc96SJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
2785daefc96SJeremy L Thompson     const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D) + elem * Q_1D * Q_1D;
2795daefc96SJeremy L Thompson 
2805daefc96SJeremy L Thompson     for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
2815daefc96SJeremy L Thompson       d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * num_elem)] = r_v[comp];
2825daefc96SJeremy L Thompson     }
2835daefc96SJeremy L Thompson   }
2845daefc96SJeremy L Thompson }
2855daefc96SJeremy L Thompson 
2865daefc96SJeremy L Thompson //------------------------------------------------------------------------------
2879e201c85SYohann // E-vector -> L-vector, strided
2889e201c85SYohann //------------------------------------------------------------------------------
2896b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
WriteLVecStrided2d(SharedData_Hip & data,const CeedInt elem,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)290f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
291672b0f2aSSebastian Grimberg                                           CeedScalar *__restrict__ d_v) {
2926b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
2936b92dc4bSJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
2949e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
295672b0f2aSSebastian Grimberg 
296672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
2979e201c85SYohann   }
2989e201c85SYohann }
2999e201c85SYohann 
3009e201c85SYohann //------------------------------------------------------------------------------
3019e201c85SYohann // 3D
3029e201c85SYohann //------------------------------------------------------------------------------
3039e201c85SYohann 
3049e201c85SYohann //------------------------------------------------------------------------------
3050183ed61SJeremy L Thompson // Set E-vector value
3060183ed61SJeremy L Thompson //------------------------------------------------------------------------------
3070183ed61SJeremy L Thompson template <int NUM_COMP, int P_1D>
SetEVecStandard3d_Single(SharedData_Hip & data,const CeedInt n,const CeedScalar value,CeedScalar * __restrict__ r_v)3080183ed61SJeremy L Thompson inline __device__ void SetEVecStandard3d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
3090183ed61SJeremy L Thompson   const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
3100183ed61SJeremy L Thompson   const CeedInt target_node_x = n % P_1D;
3110183ed61SJeremy L Thompson   const CeedInt target_node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D;
3120183ed61SJeremy L Thompson   const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);
3130183ed61SJeremy L Thompson 
3140183ed61SJeremy L Thompson   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
3150183ed61SJeremy L Thompson     r_v[target_node_z + target_comp * P_1D] = value;
3160183ed61SJeremy L Thompson   }
3170183ed61SJeremy L Thompson }
3180183ed61SJeremy L Thompson 
3190183ed61SJeremy L Thompson //------------------------------------------------------------------------------
3209e201c85SYohann // L-vector -> E-vector, offsets provided
3219e201c85SYohann //------------------------------------------------------------------------------
3226b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
ReadLVecStandard3d(SharedData_Hip & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ d_u,CeedScalar * __restrict__ r_u)323f815fac9SJeremy L Thompson inline __device__ void ReadLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
324672b0f2aSSebastian Grimberg                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
3256b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
3266b92dc4bSJeremy L Thompson     for (CeedInt z = 0; z < P_1D; z++) {
3276b92dc4bSJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
3286b92dc4bSJeremy L Thompson       const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
329672b0f2aSSebastian Grimberg 
3306b92dc4bSJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + COMP_STRIDE * comp];
3319e201c85SYohann     }
3329e201c85SYohann   }
333688b5473SJeremy L Thompson }
3349e201c85SYohann 
3359e201c85SYohann //------------------------------------------------------------------------------
3369e201c85SYohann // L-vector -> E-vector, strided
3379e201c85SYohann //------------------------------------------------------------------------------
3386b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
ReadLVecStrided3d(SharedData_Hip & data,const CeedInt elem,const CeedScalar * __restrict__ d_u,CeedScalar * __restrict__ r_u)339f815fac9SJeremy L Thompson inline __device__ void ReadLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
3406b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
3416b92dc4bSJeremy L Thompson     for (CeedInt z = 0; z < P_1D; z++) {
3426b92dc4bSJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
3439e201c85SYohann       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
344672b0f2aSSebastian Grimberg 
3456b92dc4bSJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + comp * STRIDES_COMP];
3469e201c85SYohann     }
3479e201c85SYohann   }
348688b5473SJeremy L Thompson }
3499e201c85SYohann 
3509e201c85SYohann //------------------------------------------------------------------------------
3519e201c85SYohann // E-vector -> Q-vector, offests provided
3529e201c85SYohann //------------------------------------------------------------------------------
3536b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int Q_1D>
ReadEVecSliceStandard3d(SharedData_Hip & data,const CeedInt nquads,const CeedInt elem,const CeedInt q,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ d_u,CeedScalar * __restrict__ r_u)354f815fac9SJeremy L Thompson inline __device__ void ReadEVecSliceStandard3d(SharedData_Hip &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
355f815fac9SJeremy L Thompson                                                const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
356f815fac9SJeremy L Thompson                                                CeedScalar *__restrict__ r_u) {
3576b92dc4bSJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
3586b92dc4bSJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
3596b92dc4bSJeremy L Thompson     const CeedInt ind  = indices[node + elem * Q_1D * Q_1D * Q_1D];
360672b0f2aSSebastian Grimberg 
361672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
3629e201c85SYohann   }
3639e201c85SYohann }
3649e201c85SYohann 
3659e201c85SYohann //------------------------------------------------------------------------------
3669e201c85SYohann // E-vector -> Q-vector, strided
3679e201c85SYohann //------------------------------------------------------------------------------
3686b92dc4bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
ReadEVecSliceStrided3d(SharedData_Hip & data,const CeedInt elem,const CeedInt q,const CeedScalar * __restrict__ d_u,CeedScalar * __restrict__ r_u)369f815fac9SJeremy L Thompson inline __device__ void ReadEVecSliceStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
370672b0f2aSSebastian Grimberg                                               CeedScalar *__restrict__ r_u) {
3716b92dc4bSJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
3726b92dc4bSJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
3739e201c85SYohann     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
374672b0f2aSSebastian Grimberg 
375672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
3769e201c85SYohann   }
3779e201c85SYohann }
3789e201c85SYohann 
3799e201c85SYohann //------------------------------------------------------------------------------
3809e201c85SYohann // E-vector -> L-vector, offsets provided
3819e201c85SYohann //------------------------------------------------------------------------------
3826b92dc4bSJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
WriteLVecStandard3d(SharedData_Hip & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)383f815fac9SJeremy L Thompson inline __device__ void WriteLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
384672b0f2aSSebastian Grimberg                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
3856b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
3866b92dc4bSJeremy L Thompson     for (CeedInt z = 0; z < P_1D; z++) {
3876b92dc4bSJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
3886b92dc4bSJeremy L Thompson       const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
389672b0f2aSSebastian Grimberg 
3906b92dc4bSJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]);
3919e201c85SYohann     }
3929e201c85SYohann   }
393688b5473SJeremy L Thompson }
3949e201c85SYohann 
3950183ed61SJeremy L Thompson template <int NUM_COMP, int COMP_STRIDE, int P_1D>
WriteLVecStandard3d_Single(SharedData_Hip & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt n,const CeedInt * __restrict__ indices,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)3960183ed61SJeremy L Thompson inline __device__ void WriteLVecStandard3d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
3970183ed61SJeremy L Thompson                                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
3980183ed61SJeremy L Thompson                                                   CeedScalar *__restrict__ d_v) {
3990183ed61SJeremy L Thompson   const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
4000183ed61SJeremy L Thompson   const CeedInt target_node_x = n % P_1D;
4010183ed61SJeremy L Thompson   const CeedInt target_node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D;
4020183ed61SJeremy L Thompson   const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);
4030183ed61SJeremy L Thompson 
4040183ed61SJeremy L Thompson   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
4050183ed61SJeremy L Thompson     const CeedInt node = data.t_id_x + data.t_id_y * P_1D + target_node_z * P_1D * P_1D;
4060183ed61SJeremy L Thompson     const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
4070183ed61SJeremy L Thompson 
4080183ed61SJeremy L Thompson     atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_node_z + target_comp * P_1D]);
4090183ed61SJeremy L Thompson   }
4100183ed61SJeremy L Thompson }
4110183ed61SJeremy L Thompson 
4129e201c85SYohann //------------------------------------------------------------------------------
413692716b7SZach Atkins // E-vector -> L-vector, full assembly
414692716b7SZach Atkins //------------------------------------------------------------------------------
415692716b7SZach Atkins template <int NUM_COMP, int COMP_STRIDE, int P_1D>
WriteLVecStandard3d_Assembly(SharedData_Hip & data,const CeedInt num_nodes,const CeedInt elem,const CeedInt in,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)416692716b7SZach Atkins inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
417692716b7SZach Atkins                                                     const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
418692716b7SZach Atkins   const CeedInt elem_size  = P_1D * P_1D * P_1D;
419692716b7SZach Atkins   const CeedInt in_comp    = in / elem_size;
420692716b7SZach Atkins   const CeedInt in_node_x  = in % P_1D;
421692716b7SZach Atkins   const CeedInt in_node_y  = (in % (P_1D * P_1D)) / P_1D;
422692716b7SZach Atkins   const CeedInt in_node_z  = (in % elem_size) / (P_1D * P_1D);
423692716b7SZach Atkins   const CeedInt e_vec_size = elem_size * NUM_COMP;
424692716b7SZach Atkins 
425692716b7SZach Atkins   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
426692716b7SZach Atkins     const CeedInt in_node = in_node_x + in_node_y * P_1D + in_node_z * P_1D * P_1D;
427692716b7SZach Atkins     for (CeedInt z = 0; z < P_1D; z++) {
428692716b7SZach Atkins       const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
429692716b7SZach Atkins 
430692716b7SZach Atkins       for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
431692716b7SZach Atkins         const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;
432692716b7SZach Atkins 
433692716b7SZach Atkins         d_v[elem * e_vec_size * e_vec_size + index] += r_v[z + comp * P_1D];
434692716b7SZach Atkins       }
435692716b7SZach Atkins     }
436692716b7SZach Atkins   }
437692716b7SZach Atkins }
438692716b7SZach Atkins 
439692716b7SZach Atkins //------------------------------------------------------------------------------
4405daefc96SJeremy L Thompson // E-vector -> L-vector, Qfunction assembly
4415daefc96SJeremy L Thompson //------------------------------------------------------------------------------
4425daefc96SJeremy L Thompson template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
WriteLVecStandard3d_QFAssembly(SharedData_Hip & data,const CeedInt num_elem,const CeedInt elem,const CeedInt input_offset,const CeedInt output_offset,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)4435daefc96SJeremy L Thompson inline __device__ void WriteLVecStandard3d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
4445daefc96SJeremy L Thompson                                                       const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
4455daefc96SJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
4465daefc96SJeremy L Thompson     for (CeedInt z = 0; z < Q_1D; z++) {
4475daefc96SJeremy L Thompson       const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D + z * Q_1D * Q_1D) + elem * Q_1D * Q_1D * Q_1D;
4485daefc96SJeremy L Thompson 
4495daefc96SJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
4505daefc96SJeremy L Thompson         d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * Q_1D * num_elem)] = r_v[z + comp * Q_1D];
4515daefc96SJeremy L Thompson       }
4525daefc96SJeremy L Thompson     }
4535daefc96SJeremy L Thompson   }
4545daefc96SJeremy L Thompson }
4555daefc96SJeremy L Thompson 
4565daefc96SJeremy L Thompson //------------------------------------------------------------------------------
4579e201c85SYohann // E-vector -> L-vector, strided
4589e201c85SYohann //------------------------------------------------------------------------------
4596b92dc4bSJeremy L Thompson template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
WriteLVecStrided3d(SharedData_Hip & data,const CeedInt elem,const CeedScalar * __restrict__ r_v,CeedScalar * __restrict__ d_v)460f815fac9SJeremy L Thompson inline __device__ void WriteLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
461672b0f2aSSebastian Grimberg                                           CeedScalar *__restrict__ d_v) {
4626b92dc4bSJeremy L Thompson   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
4636b92dc4bSJeremy L Thompson     for (CeedInt z = 0; z < P_1D; z++) {
4646b92dc4bSJeremy L Thompson       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
4659e201c85SYohann       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
466672b0f2aSSebastian Grimberg 
4676b92dc4bSJeremy L Thompson       for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1D];
4689e201c85SYohann     }
4699e201c85SYohann   }
470688b5473SJeremy L Thompson }
4719e201c85SYohann 
4729e201c85SYohann //------------------------------------------------------------------------------
4739e201c85SYohann // 3D collocated derivatives computation
4749e201c85SYohann //------------------------------------------------------------------------------
4756b92dc4bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
GradColloSlice3d(SharedData_Hip & data,const CeedInt q,const CeedScalar * __restrict__ r_U,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)476f815fac9SJeremy L Thompson inline __device__ void GradColloSlice3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
4772b730f8bSJeremy L Thompson                                         CeedScalar *__restrict__ r_V) {
4786b92dc4bSJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
479672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
480d6c19ee8SJeremy L Thompson       __syncthreads();
4816b92dc4bSJeremy L Thompson       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D];
4829e201c85SYohann       __syncthreads();
4839e201c85SYohann       // X derivative
484672b0f2aSSebastian Grimberg       r_V[comp + 0 * NUM_COMP] = 0.0;
4856b92dc4bSJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
4866b92dc4bSJeremy L Thompson         r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D];
487688b5473SJeremy L Thompson       }
4889e201c85SYohann       // Y derivative
489672b0f2aSSebastian Grimberg       r_V[comp + 1 * NUM_COMP] = 0.0;
4906b92dc4bSJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
4916b92dc4bSJeremy L Thompson         r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D];
492688b5473SJeremy L Thompson       }
4939e201c85SYohann       // Z derivative
494672b0f2aSSebastian Grimberg       r_V[comp + 2 * NUM_COMP] = 0.0;
4956b92dc4bSJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
4966b92dc4bSJeremy L Thompson         r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D];
497688b5473SJeremy L Thompson       }
4989e201c85SYohann     }
4999e201c85SYohann   }
5009e201c85SYohann }
5019e201c85SYohann 
5029e201c85SYohann //------------------------------------------------------------------------------
5039e201c85SYohann // 3D collocated derivatives transpose
5049e201c85SYohann //------------------------------------------------------------------------------
5056b92dc4bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
GradColloSliceTranspose3d(SharedData_Hip & data,const CeedInt q,const CeedScalar * __restrict__ r_U,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)506f815fac9SJeremy L Thompson inline __device__ void GradColloSliceTranspose3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
5072b730f8bSJeremy L Thompson                                                  CeedScalar *__restrict__ r_V) {
5086b92dc4bSJeremy L Thompson   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
509672b0f2aSSebastian Grimberg     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5109e201c85SYohann       // X derivative
511d6c19ee8SJeremy L Thompson       __syncthreads();
512672b0f2aSSebastian Grimberg       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
5139e201c85SYohann       __syncthreads();
5146b92dc4bSJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
5156b92dc4bSJeremy L Thompson         r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D];
516688b5473SJeremy L Thompson       }
5179e201c85SYohann       // Y derivative
518d6c19ee8SJeremy L Thompson       __syncthreads();
519672b0f2aSSebastian Grimberg       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
5209e201c85SYohann       __syncthreads();
5216b92dc4bSJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
5226b92dc4bSJeremy L Thompson         r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D];
523688b5473SJeremy L Thompson       }
5249e201c85SYohann       // Z derivative
5256b92dc4bSJeremy L Thompson       for (CeedInt i = 0; i < Q_1D; i++) {
5266b92dc4bSJeremy L Thompson         r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP];
527688b5473SJeremy L Thompson       }
5289e201c85SYohann     }
5299e201c85SYohann   }
5309e201c85SYohann }
531