xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h (revision 9e1d4b8291fc4f4e19d20cdfdecd866260d0e6d2)
1*9e1d4b82SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2*9e1d4b82SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3*9e1d4b82SJeremy L Thompson //
4*9e1d4b82SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
5*9e1d4b82SJeremy L Thompson //
6*9e1d4b82SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
7*9e1d4b82SJeremy L Thompson 
8*9e1d4b82SJeremy L Thompson /// @file
9*9e1d4b82SJeremy L Thompson /// Internal header for CUDA tensor product basis with AtPoints evaluation
10*9e1d4b82SJeremy L Thompson #include <ceed/types.h>
11*9e1d4b82SJeremy L Thompson 
12*9e1d4b82SJeremy L Thompson #include "cuda-shared-basis-read-write-templates.h"
13*9e1d4b82SJeremy L Thompson #include "cuda-shared-basis-tensor-at-points-templates.h"
14*9e1d4b82SJeremy L Thompson #include "cuda-shared-basis-tensor-templates.h"
15*9e1d4b82SJeremy L Thompson 
16*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
17*9e1d4b82SJeremy L Thompson // Tensor Basis Kernels AtPoints
18*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
19*9e1d4b82SJeremy L Thompson 
20*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
21*9e1d4b82SJeremy L Thompson // Interp
22*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
23*9e1d4b82SJeremy L Thompson extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem,
24*9e1d4b82SJeremy L Thompson                                           const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
25*9e1d4b82SJeremy L Thompson   extern __shared__ CeedScalar slice[];
26*9e1d4b82SJeremy L Thompson 
27*9e1d4b82SJeremy L Thompson   SharedData_Cuda data;
28*9e1d4b82SJeremy L Thompson   data.t_id_x = threadIdx.x;
29*9e1d4b82SJeremy L Thompson   data.t_id_y = threadIdx.y;
30*9e1d4b82SJeremy L Thompson   data.t_id_z = threadIdx.z;
31*9e1d4b82SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
32*9e1d4b82SJeremy L Thompson   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
33*9e1d4b82SJeremy L Thompson 
34*9e1d4b82SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
35*9e1d4b82SJeremy L Thompson   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
36*9e1d4b82SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP];
37*9e1d4b82SJeremy L Thompson 
38*9e1d4b82SJeremy L Thompson   // Apply basis element by element
39*9e1d4b82SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
40*9e1d4b82SJeremy L Thompson     // Map to coefficients
41*9e1d4b82SJeremy L Thompson     if (BASIS_DIM == 1) {
42*9e1d4b82SJeremy L Thompson       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
43*9e1d4b82SJeremy L Thompson       Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
44*9e1d4b82SJeremy L Thompson     } else if (BASIS_DIM == 2) {
45*9e1d4b82SJeremy L Thompson       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
46*9e1d4b82SJeremy L Thompson       InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
47*9e1d4b82SJeremy L Thompson     } else if (BASIS_DIM == 3) {
48*9e1d4b82SJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
49*9e1d4b82SJeremy L Thompson                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
50*9e1d4b82SJeremy L Thompson       InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
51*9e1d4b82SJeremy L Thompson     }
52*9e1d4b82SJeremy L Thompson 
53*9e1d4b82SJeremy L Thompson     // Map to points
54*9e1d4b82SJeremy L Thompson     const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
55*9e1d4b82SJeremy L Thompson 
56*9e1d4b82SJeremy L Thompson     for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
57*9e1d4b82SJeremy L Thompson       const CeedInt p = i % BASIS_NUM_PTS;
58*9e1d4b82SJeremy L Thompson       CeedScalar    r_X[BASIS_DIM];
59*9e1d4b82SJeremy L Thompson 
60*9e1d4b82SJeremy L Thompson       for (CeedInt d = 0; d < BASIS_DIM; d++) {
61*9e1d4b82SJeremy L Thompson         r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
62*9e1d4b82SJeremy L Thompson       }
63*9e1d4b82SJeremy L Thompson       if (BASIS_DIM == 1) {
64*9e1d4b82SJeremy L Thompson         InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
65*9e1d4b82SJeremy L Thompson       } else if (BASIS_DIM == 2) {
66*9e1d4b82SJeremy L Thompson         InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
67*9e1d4b82SJeremy L Thompson       } else if (BASIS_DIM == 3) {
68*9e1d4b82SJeremy L Thompson         InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
69*9e1d4b82SJeremy L Thompson       }
70*9e1d4b82SJeremy L Thompson       for (CeedInt j = 0; j < BASIS_NUM_COMP; j++) {
71*9e1d4b82SJeremy L Thompson         if (i < BASIS_NUM_PTS) d_V[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + i] = r_V[j];
72*9e1d4b82SJeremy L Thompson       }
73*9e1d4b82SJeremy L Thompson     }
74*9e1d4b82SJeremy L Thompson   }
75*9e1d4b82SJeremy L Thompson }
76*9e1d4b82SJeremy L Thompson 
77*9e1d4b82SJeremy L Thompson extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B,
78*9e1d4b82SJeremy L Thompson                                                    const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X,
79*9e1d4b82SJeremy L Thompson                                                    const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
80*9e1d4b82SJeremy L Thompson   extern __shared__ CeedScalar slice[];
81*9e1d4b82SJeremy L Thompson 
82*9e1d4b82SJeremy L Thompson   SharedData_Cuda data;
83*9e1d4b82SJeremy L Thompson   data.t_id_x = threadIdx.x;
84*9e1d4b82SJeremy L Thompson   data.t_id_y = threadIdx.y;
85*9e1d4b82SJeremy L Thompson   data.t_id_z = threadIdx.z;
86*9e1d4b82SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
87*9e1d4b82SJeremy L Thompson   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
88*9e1d4b82SJeremy L Thompson 
89*9e1d4b82SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP];
90*9e1d4b82SJeremy L Thompson   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
91*9e1d4b82SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
92*9e1d4b82SJeremy L Thompson 
93*9e1d4b82SJeremy L Thompson   // Apply basis element by element
94*9e1d4b82SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
95*9e1d4b82SJeremy L Thompson     // Clear register
96*9e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
97*9e1d4b82SJeremy L Thompson 
98*9e1d4b82SJeremy L Thompson     // Map from points
99*9e1d4b82SJeremy L Thompson     const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
100*9e1d4b82SJeremy L Thompson 
101*9e1d4b82SJeremy L Thompson     for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
102*9e1d4b82SJeremy L Thompson       const CeedInt p = i % BASIS_NUM_PTS;
103*9e1d4b82SJeremy L Thompson       CeedScalar    r_X[BASIS_DIM];
104*9e1d4b82SJeremy L Thompson 
105*9e1d4b82SJeremy L Thompson       for (CeedInt d = 0; d < BASIS_DIM; d++) {
106*9e1d4b82SJeremy L Thompson         r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
107*9e1d4b82SJeremy L Thompson       }
108*9e1d4b82SJeremy L Thompson       for (CeedInt j = 0; j < BASIS_NUM_COMP; j++) {
109*9e1d4b82SJeremy L Thompson         if (i < points_per_elem[elem]) r_U[j] = d_U[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + p];
110*9e1d4b82SJeremy L Thompson         else r_U[j] = 0.0;
111*9e1d4b82SJeremy L Thompson       }
112*9e1d4b82SJeremy L Thompson       if (BASIS_DIM == 1) {
113*9e1d4b82SJeremy L Thompson         InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
114*9e1d4b82SJeremy L Thompson       } else if (BASIS_DIM == 2) {
115*9e1d4b82SJeremy L Thompson         InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
116*9e1d4b82SJeremy L Thompson       } else if (BASIS_DIM == 3) {
117*9e1d4b82SJeremy L Thompson         InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
118*9e1d4b82SJeremy L Thompson       }
119*9e1d4b82SJeremy L Thompson     }
120*9e1d4b82SJeremy L Thompson     __syncthreads();
121*9e1d4b82SJeremy L Thompson 
122*9e1d4b82SJeremy L Thompson     // Map from coefficients
123*9e1d4b82SJeremy L Thompson     if (BASIS_DIM == 1) {
124*9e1d4b82SJeremy L Thompson       InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
125*9e1d4b82SJeremy L Thompson       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
126*9e1d4b82SJeremy L Thompson     } else if (BASIS_DIM == 2) {
127*9e1d4b82SJeremy L Thompson       InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
128*9e1d4b82SJeremy L Thompson       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
129*9e1d4b82SJeremy L Thompson     } else if (BASIS_DIM == 3) {
130*9e1d4b82SJeremy L Thompson       InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
131*9e1d4b82SJeremy L Thompson       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
132*9e1d4b82SJeremy L Thompson                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
133*9e1d4b82SJeremy L Thompson     }
134*9e1d4b82SJeremy L Thompson   }
135*9e1d4b82SJeremy L Thompson }
136*9e1d4b82SJeremy L Thompson 
137*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
138*9e1d4b82SJeremy L Thompson // Grad
139*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------
140*9e1d4b82SJeremy L Thompson extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem,
141*9e1d4b82SJeremy L Thompson                                         const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
142*9e1d4b82SJeremy L Thompson   extern __shared__ CeedScalar slice[];
143*9e1d4b82SJeremy L Thompson 
144*9e1d4b82SJeremy L Thompson   SharedData_Cuda data;
145*9e1d4b82SJeremy L Thompson   data.t_id_x = threadIdx.x;
146*9e1d4b82SJeremy L Thompson   data.t_id_y = threadIdx.y;
147*9e1d4b82SJeremy L Thompson   data.t_id_z = threadIdx.z;
148*9e1d4b82SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
149*9e1d4b82SJeremy L Thompson   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
150*9e1d4b82SJeremy L Thompson 
151*9e1d4b82SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
152*9e1d4b82SJeremy L Thompson   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
153*9e1d4b82SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
154*9e1d4b82SJeremy L Thompson 
155*9e1d4b82SJeremy L Thompson   // Apply basis element by element
156*9e1d4b82SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
157*9e1d4b82SJeremy L Thompson     // Map to coefficients
158*9e1d4b82SJeremy L Thompson     if (BASIS_DIM == 1) {
159*9e1d4b82SJeremy L Thompson       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
160*9e1d4b82SJeremy L Thompson       Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
161*9e1d4b82SJeremy L Thompson     } else if (BASIS_DIM == 2) {
162*9e1d4b82SJeremy L Thompson       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
163*9e1d4b82SJeremy L Thompson       InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
164*9e1d4b82SJeremy L Thompson     } else if (BASIS_DIM == 3) {
165*9e1d4b82SJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
166*9e1d4b82SJeremy L Thompson                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
167*9e1d4b82SJeremy L Thompson       InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
168*9e1d4b82SJeremy L Thompson     }
169*9e1d4b82SJeremy L Thompson 
170*9e1d4b82SJeremy L Thompson     // Map to points
171*9e1d4b82SJeremy L Thompson     const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
172*9e1d4b82SJeremy L Thompson 
173*9e1d4b82SJeremy L Thompson     for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
174*9e1d4b82SJeremy L Thompson       const CeedInt p = i % BASIS_NUM_PTS;
175*9e1d4b82SJeremy L Thompson       CeedScalar    r_X[BASIS_DIM];
176*9e1d4b82SJeremy L Thompson 
177*9e1d4b82SJeremy L Thompson       for (CeedInt d = 0; d < BASIS_DIM; d++) {
178*9e1d4b82SJeremy L Thompson         r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
179*9e1d4b82SJeremy L Thompson       }
180*9e1d4b82SJeremy L Thompson       if (BASIS_DIM == 1) {
181*9e1d4b82SJeremy L Thompson         GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
182*9e1d4b82SJeremy L Thompson       } else if (BASIS_DIM == 2) {
183*9e1d4b82SJeremy L Thompson         GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
184*9e1d4b82SJeremy L Thompson       } else if (BASIS_DIM == 3) {
185*9e1d4b82SJeremy L Thompson         GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
186*9e1d4b82SJeremy L Thompson       }
187*9e1d4b82SJeremy L Thompson       for (CeedInt j = 0; j < BASIS_NUM_COMP * BASIS_DIM; j++) {
188*9e1d4b82SJeremy L Thompson         if (i < BASIS_NUM_PTS) d_V[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + i] = r_V[j];
189*9e1d4b82SJeremy L Thompson       }
190*9e1d4b82SJeremy L Thompson     }
191*9e1d4b82SJeremy L Thompson   }
192*9e1d4b82SJeremy L Thompson }
193*9e1d4b82SJeremy L Thompson 
194*9e1d4b82SJeremy L Thompson extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B,
195*9e1d4b82SJeremy L Thompson                                                  const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X,
196*9e1d4b82SJeremy L Thompson                                                  const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
197*9e1d4b82SJeremy L Thompson   extern __shared__ CeedScalar slice[];
198*9e1d4b82SJeremy L Thompson 
199*9e1d4b82SJeremy L Thompson   SharedData_Cuda data;
200*9e1d4b82SJeremy L Thompson   data.t_id_x = threadIdx.x;
201*9e1d4b82SJeremy L Thompson   data.t_id_y = threadIdx.y;
202*9e1d4b82SJeremy L Thompson   data.t_id_z = threadIdx.z;
203*9e1d4b82SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
204*9e1d4b82SJeremy L Thompson   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
205*9e1d4b82SJeremy L Thompson 
206*9e1d4b82SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
207*9e1d4b82SJeremy L Thompson   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
208*9e1d4b82SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
209*9e1d4b82SJeremy L Thompson 
210*9e1d4b82SJeremy L Thompson   // Apply basis element by element
211*9e1d4b82SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
212*9e1d4b82SJeremy L Thompson     // Clear register
213*9e1d4b82SJeremy L Thompson     for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
214*9e1d4b82SJeremy L Thompson 
215*9e1d4b82SJeremy L Thompson     // Map from points
216*9e1d4b82SJeremy L Thompson     const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
217*9e1d4b82SJeremy L Thompson 
218*9e1d4b82SJeremy L Thompson     for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
219*9e1d4b82SJeremy L Thompson       const CeedInt p = i % BASIS_NUM_PTS;
220*9e1d4b82SJeremy L Thompson       CeedScalar    r_X[BASIS_DIM];
221*9e1d4b82SJeremy L Thompson 
222*9e1d4b82SJeremy L Thompson       for (CeedInt d = 0; d < BASIS_DIM; d++) {
223*9e1d4b82SJeremy L Thompson         r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
224*9e1d4b82SJeremy L Thompson       }
225*9e1d4b82SJeremy L Thompson       for (CeedInt j = 0; j < BASIS_NUM_COMP * BASIS_DIM; j++) {
226*9e1d4b82SJeremy L Thompson         if (i < points_per_elem[elem]) r_U[j] = d_U[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + p];
227*9e1d4b82SJeremy L Thompson         else r_U[j] = 0.0;
228*9e1d4b82SJeremy L Thompson       }
229*9e1d4b82SJeremy L Thompson       if (BASIS_DIM == 1) {
230*9e1d4b82SJeremy L Thompson         GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
231*9e1d4b82SJeremy L Thompson       } else if (BASIS_DIM == 2) {
232*9e1d4b82SJeremy L Thompson         GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
233*9e1d4b82SJeremy L Thompson       } else if (BASIS_DIM == 3) {
234*9e1d4b82SJeremy L Thompson         GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
235*9e1d4b82SJeremy L Thompson       }
236*9e1d4b82SJeremy L Thompson     }
237*9e1d4b82SJeremy L Thompson     __syncthreads();
238*9e1d4b82SJeremy L Thompson 
239*9e1d4b82SJeremy L Thompson     // Map from coefficients
240*9e1d4b82SJeremy L Thompson     if (BASIS_DIM == 1) {
241*9e1d4b82SJeremy L Thompson       InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
242*9e1d4b82SJeremy L Thompson       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
243*9e1d4b82SJeremy L Thompson     } else if (BASIS_DIM == 2) {
244*9e1d4b82SJeremy L Thompson       InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
245*9e1d4b82SJeremy L Thompson       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
246*9e1d4b82SJeremy L Thompson     } else if (BASIS_DIM == 3) {
247*9e1d4b82SJeremy L Thompson       InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
248*9e1d4b82SJeremy L Thompson       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
249*9e1d4b82SJeremy L Thompson                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
250*9e1d4b82SJeremy L Thompson     }
251*9e1d4b82SJeremy L Thompson   }
252*9e1d4b82SJeremy L Thompson }
253