xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h (revision 9e1d4b8291fc4f4e19d20cdfdecd866260d0e6d2)
1*9e1d4b82SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2*9e1d4b82SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3*9e1d4b82SJeremy L Thompson //
4*9e1d4b82SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
5*9e1d4b82SJeremy L Thompson //
6*9e1d4b82SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
7*9e1d4b82SJeremy L Thompson 
8*9e1d4b82SJeremy L Thompson /// @file
9*9e1d4b82SJeremy L Thompson /// Internal header for HIP tensor product basis with AtPoints evaluation
10*9e1d4b82SJeremy L Thompson #include <ceed/types.h>
11*9e1d4b82SJeremy L Thompson 
12*9e1d4b82SJeremy L Thompson #include "hip-shared-basis-read-write-templates.h"
13*9e1d4b82SJeremy L Thompson #include "hip-shared-basis-tensor-at-points-templates.h"
14*9e1d4b82SJeremy L Thompson #include "hip-shared-basis-tensor-templates.h"
15*9e1d4b82SJeremy L Thompson 
16*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
17*9e1d4b82SJeremy L Thompson // Tensor Basis Kernels AtPoints
18*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
19*9e1d4b82SJeremy L Thompson 
20*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
21*9e1d4b82SJeremy L Thompson // Interp
22*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
23*9e1d4b82SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
24*9e1d4b82SJeremy L Thompson     void InterpAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
25*9e1d4b82SJeremy L Thompson                         const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
26*9e1d4b82SJeremy L Thompson   extern __shared__ CeedScalar slice[];
27*9e1d4b82SJeremy L Thompson 
28*9e1d4b82SJeremy L Thompson   // load chebyshev_interp_1d into shared memory
29*9e1d4b82SJeremy L Thompson   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
30*9e1d4b82SJeremy L Thompson   loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
31*9e1d4b82SJeremy L Thompson   __syncthreads();
32*9e1d4b82SJeremy L Thompson 
33*9e1d4b82SJeremy L Thompson   SharedData_Hip data;
34*9e1d4b82SJeremy L Thompson   data.t_id_x = threadIdx.x;
35*9e1d4b82SJeremy L Thompson   data.t_id_y = threadIdx.y;
36*9e1d4b82SJeremy L Thompson   data.t_id_z = threadIdx.z;
37*9e1d4b82SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
38*9e1d4b82SJeremy L Thompson   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
39*9e1d4b82SJeremy L Thompson 
40*9e1d4b82SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
41*9e1d4b82SJeremy L Thompson   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
42*9e1d4b82SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP];
43*9e1d4b82SJeremy L Thompson 
44*9e1d4b82SJeremy L Thompson   // Apply basis element by element
45*9e1d4b82SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
46*9e1d4b82SJeremy L Thompson     // Map to coefficients
47*9e1d4b82SJeremy L Thompson     if (BASIS_DIM == 1) {
48*9e1d4b82SJeremy L Thompson       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
49*9e1d4b82SJeremy L Thompson       Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
50*9e1d4b82SJeremy L Thompson     } else if (BASIS_DIM == 2) {
51*9e1d4b82SJeremy L Thompson       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
52*9e1d4b82SJeremy L Thompson       InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
53*9e1d4b82SJeremy L Thompson     } else if (BASIS_DIM == 3) {
54*9e1d4b82SJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
55*9e1d4b82SJeremy L Thompson                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
56*9e1d4b82SJeremy L Thompson       InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
57*9e1d4b82SJeremy L Thompson     }
58*9e1d4b82SJeremy L Thompson 
59*9e1d4b82SJeremy L Thompson     // Map to points
60*9e1d4b82SJeremy L Thompson     const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
61*9e1d4b82SJeremy L Thompson 
62*9e1d4b82SJeremy L Thompson     for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
63*9e1d4b82SJeremy L Thompson       const CeedInt p = i % BASIS_NUM_PTS;
64*9e1d4b82SJeremy L Thompson       CeedScalar    r_X[BASIS_DIM];
65*9e1d4b82SJeremy L Thompson 
66*9e1d4b82SJeremy L Thompson       for (CeedInt d = 0; d < BASIS_DIM; d++) {
67*9e1d4b82SJeremy L Thompson         r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
68*9e1d4b82SJeremy L Thompson       }
69*9e1d4b82SJeremy L Thompson       if (BASIS_DIM == 1) {
70*9e1d4b82SJeremy L Thompson         InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
71*9e1d4b82SJeremy L Thompson       } else if (BASIS_DIM == 2) {
72*9e1d4b82SJeremy L Thompson         InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
73*9e1d4b82SJeremy L Thompson       } else if (BASIS_DIM == 3) {
74*9e1d4b82SJeremy L Thompson         InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
75*9e1d4b82SJeremy L Thompson       }
76*9e1d4b82SJeremy L Thompson       for (CeedInt j = 0; j < BASIS_NUM_COMP; j++) {
77*9e1d4b82SJeremy L Thompson         if (i < BASIS_NUM_PTS) d_V[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + i] = r_V[j];
78*9e1d4b82SJeremy L Thompson       }
79*9e1d4b82SJeremy L Thompson     }
80*9e1d4b82SJeremy L Thompson   }
81*9e1d4b82SJeremy L Thompson }
82*9e1d4b82SJeremy L Thompson 
83*9e1d4b82SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
84*9e1d4b82SJeremy L Thompson     void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
85*9e1d4b82SJeremy L Thompson                                  const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
86*9e1d4b82SJeremy L Thompson   extern __shared__ CeedScalar slice[];
87*9e1d4b82SJeremy L Thompson 
88*9e1d4b82SJeremy L Thompson   // load chebyshev_interp_1d into shared memory
89*9e1d4b82SJeremy L Thompson   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
90*9e1d4b82SJeremy L Thompson   loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
91*9e1d4b82SJeremy L Thompson   __syncthreads();
92*9e1d4b82SJeremy L Thompson 
93*9e1d4b82SJeremy L Thompson   SharedData_Hip data;
94*9e1d4b82SJeremy L Thompson   data.t_id_x = threadIdx.x;
95*9e1d4b82SJeremy L Thompson   data.t_id_y = threadIdx.y;
96*9e1d4b82SJeremy L Thompson   data.t_id_z = threadIdx.z;
97*9e1d4b82SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
98*9e1d4b82SJeremy L Thompson   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
99*9e1d4b82SJeremy L Thompson 
100*9e1d4b82SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP];
101*9e1d4b82SJeremy L Thompson   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
102*9e1d4b82SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
103*9e1d4b82SJeremy L Thompson 
104*9e1d4b82SJeremy L Thompson   // Apply basis element by element
105*9e1d4b82SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
106*9e1d4b82SJeremy L Thompson     // Clear register
107*9e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
108*9e1d4b82SJeremy L Thompson 
109*9e1d4b82SJeremy L Thompson     // Map from points
110*9e1d4b82SJeremy L Thompson     const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
111*9e1d4b82SJeremy L Thompson 
112*9e1d4b82SJeremy L Thompson     for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
113*9e1d4b82SJeremy L Thompson       const CeedInt p = i % BASIS_NUM_PTS;
114*9e1d4b82SJeremy L Thompson       CeedScalar    r_X[BASIS_DIM];
115*9e1d4b82SJeremy L Thompson 
116*9e1d4b82SJeremy L Thompson       for (CeedInt d = 0; d < BASIS_DIM; d++) {
117*9e1d4b82SJeremy L Thompson         r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
118*9e1d4b82SJeremy L Thompson       }
119*9e1d4b82SJeremy L Thompson       for (CeedInt j = 0; j < BASIS_NUM_COMP; j++) {
120*9e1d4b82SJeremy L Thompson         if (i < points_per_elem[elem]) r_U[j] = d_U[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + p];
121*9e1d4b82SJeremy L Thompson         else r_U[j] = 0.0;
122*9e1d4b82SJeremy L Thompson       }
123*9e1d4b82SJeremy L Thompson       if (BASIS_DIM == 1) {
124*9e1d4b82SJeremy L Thompson         InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
125*9e1d4b82SJeremy L Thompson       } else if (BASIS_DIM == 2) {
126*9e1d4b82SJeremy L Thompson         InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
127*9e1d4b82SJeremy L Thompson       } else if (BASIS_DIM == 3) {
128*9e1d4b82SJeremy L Thompson         InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
129*9e1d4b82SJeremy L Thompson       }
130*9e1d4b82SJeremy L Thompson     }
131*9e1d4b82SJeremy L Thompson     __syncthreads();
132*9e1d4b82SJeremy L Thompson 
133*9e1d4b82SJeremy L Thompson     // Map from coefficients
134*9e1d4b82SJeremy L Thompson     if (BASIS_DIM == 1) {
135*9e1d4b82SJeremy L Thompson       InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
136*9e1d4b82SJeremy L Thompson       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
137*9e1d4b82SJeremy L Thompson     } else if (BASIS_DIM == 2) {
138*9e1d4b82SJeremy L Thompson       InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
139*9e1d4b82SJeremy L Thompson       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
140*9e1d4b82SJeremy L Thompson     } else if (BASIS_DIM == 3) {
141*9e1d4b82SJeremy L Thompson       InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
142*9e1d4b82SJeremy L Thompson       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
143*9e1d4b82SJeremy L Thompson                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
144*9e1d4b82SJeremy L Thompson     }
145*9e1d4b82SJeremy L Thompson   }
146*9e1d4b82SJeremy L Thompson }
147*9e1d4b82SJeremy L Thompson 
148*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
149*9e1d4b82SJeremy L Thompson // Grad
150*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
151*9e1d4b82SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
152*9e1d4b82SJeremy L Thompson     void GradAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
153*9e1d4b82SJeremy L Thompson                       const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
154*9e1d4b82SJeremy L Thompson   extern __shared__ CeedScalar slice[];
155*9e1d4b82SJeremy L Thompson 
156*9e1d4b82SJeremy L Thompson   // load chebyshev_interp_1d into shared memory
157*9e1d4b82SJeremy L Thompson   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
158*9e1d4b82SJeremy L Thompson   loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
159*9e1d4b82SJeremy L Thompson   __syncthreads();
160*9e1d4b82SJeremy L Thompson 
161*9e1d4b82SJeremy L Thompson   SharedData_Hip data;
162*9e1d4b82SJeremy L Thompson   data.t_id_x = threadIdx.x;
163*9e1d4b82SJeremy L Thompson   data.t_id_y = threadIdx.y;
164*9e1d4b82SJeremy L Thompson   data.t_id_z = threadIdx.z;
165*9e1d4b82SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
166*9e1d4b82SJeremy L Thompson   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
167*9e1d4b82SJeremy L Thompson 
168*9e1d4b82SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
169*9e1d4b82SJeremy L Thompson   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
170*9e1d4b82SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
171*9e1d4b82SJeremy L Thompson 
172*9e1d4b82SJeremy L Thompson   // Apply basis element by element
173*9e1d4b82SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
174*9e1d4b82SJeremy L Thompson     // Map to coefficients
175*9e1d4b82SJeremy L Thompson     if (BASIS_DIM == 1) {
176*9e1d4b82SJeremy L Thompson       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
177*9e1d4b82SJeremy L Thompson       Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
178*9e1d4b82SJeremy L Thompson     } else if (BASIS_DIM == 2) {
179*9e1d4b82SJeremy L Thompson       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
180*9e1d4b82SJeremy L Thompson       InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
181*9e1d4b82SJeremy L Thompson     } else if (BASIS_DIM == 3) {
182*9e1d4b82SJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
183*9e1d4b82SJeremy L Thompson                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
184*9e1d4b82SJeremy L Thompson       InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
185*9e1d4b82SJeremy L Thompson     }
186*9e1d4b82SJeremy L Thompson 
187*9e1d4b82SJeremy L Thompson     // Map to points
188*9e1d4b82SJeremy L Thompson     const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
189*9e1d4b82SJeremy L Thompson 
190*9e1d4b82SJeremy L Thompson     for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
191*9e1d4b82SJeremy L Thompson       const CeedInt p = i % BASIS_NUM_PTS;
192*9e1d4b82SJeremy L Thompson       CeedScalar    r_X[BASIS_DIM];
193*9e1d4b82SJeremy L Thompson 
194*9e1d4b82SJeremy L Thompson       for (CeedInt d = 0; d < BASIS_DIM; d++) {
195*9e1d4b82SJeremy L Thompson         r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
196*9e1d4b82SJeremy L Thompson       }
197*9e1d4b82SJeremy L Thompson       if (BASIS_DIM == 1) {
198*9e1d4b82SJeremy L Thompson         GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
199*9e1d4b82SJeremy L Thompson       } else if (BASIS_DIM == 2) {
200*9e1d4b82SJeremy L Thompson         GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
201*9e1d4b82SJeremy L Thompson       } else if (BASIS_DIM == 3) {
202*9e1d4b82SJeremy L Thompson         GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
203*9e1d4b82SJeremy L Thompson       }
204*9e1d4b82SJeremy L Thompson       for (CeedInt j = 0; j < BASIS_NUM_COMP * BASIS_DIM; j++) {
205*9e1d4b82SJeremy L Thompson         if (i < BASIS_NUM_PTS) d_V[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + i] = r_V[j];
206*9e1d4b82SJeremy L Thompson       }
207*9e1d4b82SJeremy L Thompson     }
208*9e1d4b82SJeremy L Thompson   }
209*9e1d4b82SJeremy L Thompson }
210*9e1d4b82SJeremy L Thompson 
211*9e1d4b82SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
212*9e1d4b82SJeremy L Thompson     void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
213*9e1d4b82SJeremy L Thompson                                const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
214*9e1d4b82SJeremy L Thompson   extern __shared__ CeedScalar slice[];
215*9e1d4b82SJeremy L Thompson 
216*9e1d4b82SJeremy L Thompson   // load chebyshev_interp_1d into shared memory
217*9e1d4b82SJeremy L Thompson   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
218*9e1d4b82SJeremy L Thompson   loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
219*9e1d4b82SJeremy L Thompson   __syncthreads();
220*9e1d4b82SJeremy L Thompson 
221*9e1d4b82SJeremy L Thompson   SharedData_Hip data;
222*9e1d4b82SJeremy L Thompson   data.t_id_x = threadIdx.x;
223*9e1d4b82SJeremy L Thompson   data.t_id_y = threadIdx.y;
224*9e1d4b82SJeremy L Thompson   data.t_id_z = threadIdx.z;
225*9e1d4b82SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
226*9e1d4b82SJeremy L Thompson   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
227*9e1d4b82SJeremy L Thompson 
228*9e1d4b82SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
229*9e1d4b82SJeremy L Thompson   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
230*9e1d4b82SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
231*9e1d4b82SJeremy L Thompson 
232*9e1d4b82SJeremy L Thompson   // Apply basis element by element
233*9e1d4b82SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
234*9e1d4b82SJeremy L Thompson     // Clear register
235*9e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
236*9e1d4b82SJeremy L Thompson 
237*9e1d4b82SJeremy L Thompson     // Map from points
238*9e1d4b82SJeremy L Thompson     const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
239*9e1d4b82SJeremy L Thompson 
240*9e1d4b82SJeremy L Thompson     for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
241*9e1d4b82SJeremy L Thompson       const CeedInt p = i % BASIS_NUM_PTS;
242*9e1d4b82SJeremy L Thompson       CeedScalar    r_X[BASIS_DIM];
243*9e1d4b82SJeremy L Thompson 
244*9e1d4b82SJeremy L Thompson       for (CeedInt d = 0; d < BASIS_DIM; d++) {
245*9e1d4b82SJeremy L Thompson         r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
246*9e1d4b82SJeremy L Thompson       }
247*9e1d4b82SJeremy L Thompson       for (CeedInt j = 0; j < BASIS_NUM_COMP * BASIS_DIM; j++) {
248*9e1d4b82SJeremy L Thompson         if (i < points_per_elem[elem]) r_U[j] = d_U[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + p];
249*9e1d4b82SJeremy L Thompson         else r_U[j] = 0.0;
250*9e1d4b82SJeremy L Thompson       }
251*9e1d4b82SJeremy L Thompson       if (BASIS_DIM == 1) {
252*9e1d4b82SJeremy L Thompson         GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
253*9e1d4b82SJeremy L Thompson       } else if (BASIS_DIM == 2) {
254*9e1d4b82SJeremy L Thompson         GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
255*9e1d4b82SJeremy L Thompson       } else if (BASIS_DIM == 3) {
256*9e1d4b82SJeremy L Thompson         GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
257*9e1d4b82SJeremy L Thompson       }
258*9e1d4b82SJeremy L Thompson     }
259*9e1d4b82SJeremy L Thompson     __syncthreads();
260*9e1d4b82SJeremy L Thompson 
261*9e1d4b82SJeremy L Thompson     // Map from coefficients
262*9e1d4b82SJeremy L Thompson     if (BASIS_DIM == 1) {
263*9e1d4b82SJeremy L Thompson       InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
264*9e1d4b82SJeremy L Thompson       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
265*9e1d4b82SJeremy L Thompson     } else if (BASIS_DIM == 2) {
266*9e1d4b82SJeremy L Thompson       InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
267*9e1d4b82SJeremy L Thompson       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
268*9e1d4b82SJeremy L Thompson     } else if (BASIS_DIM == 3) {
269*9e1d4b82SJeremy L Thompson       InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
270*9e1d4b82SJeremy L Thompson       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
271*9e1d4b82SJeremy L Thompson                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
272*9e1d4b82SJeremy L Thompson     }
273*9e1d4b82SJeremy L Thompson   }
274*9e1d4b82SJeremy L Thompson }
275