xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor.h (revision 02219a082eb38cf2d3edc97fbfe55fa395a4dc99)
1d275d636SJeremy L Thompson // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
29e201c85SYohann // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39e201c85SYohann //
49e201c85SYohann // SPDX-License-Identifier: BSD-2-Clause
59e201c85SYohann //
69e201c85SYohann // This file is part of CEED:  http://github.com/ceed
79e201c85SYohann 
89e201c85SYohann /// @file
99e201c85SYohann /// Internal header for HIP shared memory tensor product basis
10c0b5abf0SJeremy L Thompson #include <ceed/types.h>
112b730f8bSJeremy L Thompson 
129e201c85SYohann #include "hip-shared-basis-read-write-templates.h"
139e201c85SYohann #include "hip-shared-basis-tensor-templates.h"
149e201c85SYohann 
159e201c85SYohann //------------------------------------------------------------------------------
169e201c85SYohann // Interp kernel by dim
179e201c85SYohann //------------------------------------------------------------------------------
182b730f8bSJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
19aa4002adSJeremy L Thompson     void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
209e201c85SYohann   extern __shared__ CeedScalar slice[];
21b2165e7aSSebastian Grimberg 
229e201c85SYohann   SharedData_Hip data;
239e201c85SYohann   data.t_id_x = threadIdx.x;
249e201c85SYohann   data.t_id_y = threadIdx.y;
259e201c85SYohann   data.t_id_z = threadIdx.z;
269e201c85SYohann   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
276b92dc4bSJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
289e201c85SYohann 
299e201c85SYohann   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
309e201c85SYohann   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
319e201c85SYohann 
32aa4002adSJeremy L Thompson   // load interp_1d into shared memory
33aa4002adSJeremy L Thompson   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
34aa4002adSJeremy L Thompson   LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
35aa4002adSJeremy L Thompson   __syncthreads();
36aa4002adSJeremy L Thompson 
37aa4002adSJeremy L Thompson   // Apply basis element by element
389e201c85SYohann   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
399e201c85SYohann     if (BASIS_DIM == 1) {
409e201c85SYohann       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
416b92dc4bSJeremy L Thompson       Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
429e201c85SYohann       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
439e201c85SYohann     } else if (BASIS_DIM == 2) {
449e201c85SYohann       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
456b92dc4bSJeremy L Thompson       InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
469e201c85SYohann       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
479e201c85SYohann     } else if (BASIS_DIM == 3) {
482b730f8bSJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
492b730f8bSJeremy L Thompson                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
506b92dc4bSJeremy L Thompson       InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
512b730f8bSJeremy L Thompson       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
522b730f8bSJeremy L Thompson                                                         BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
539e201c85SYohann     }
549e201c85SYohann   }
559e201c85SYohann }
569e201c85SYohann 
572b730f8bSJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
58*02219a08SJeremy L Thompson     void InterpCollocated(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
59*02219a08SJeremy L Thompson   extern __shared__ CeedScalar slice[];
60*02219a08SJeremy L Thompson 
61*02219a08SJeremy L Thompson   SharedData_Hip data;
62*02219a08SJeremy L Thompson   data.t_id_x = threadIdx.x;
63*02219a08SJeremy L Thompson   data.t_id_y = threadIdx.y;
64*02219a08SJeremy L Thompson   data.t_id_z = threadIdx.z;
65*02219a08SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
66*02219a08SJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
67*02219a08SJeremy L Thompson 
68*02219a08SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
69*02219a08SJeremy L Thompson 
70*02219a08SJeremy L Thompson   // Apply basis element by element
71*02219a08SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
72*02219a08SJeremy L Thompson     if (BASIS_DIM == 1) {
73*02219a08SJeremy L Thompson       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
74*02219a08SJeremy L Thompson       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_U, d_V);
75*02219a08SJeremy L Thompson     } else if (BASIS_DIM == 2) {
76*02219a08SJeremy L Thompson       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
77*02219a08SJeremy L Thompson       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_U, d_V);
78*02219a08SJeremy L Thompson     } else if (BASIS_DIM == 3) {
79*02219a08SJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
80*02219a08SJeremy L Thompson                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
81*02219a08SJeremy L Thompson       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
82*02219a08SJeremy L Thompson                                                         BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_U, d_V);
83*02219a08SJeremy L Thompson     }
84*02219a08SJeremy L Thompson   }
85*02219a08SJeremy L Thompson }
86*02219a08SJeremy L Thompson 
87*02219a08SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
88aa4002adSJeremy L Thompson     void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
899e201c85SYohann   extern __shared__ CeedScalar slice[];
90b2165e7aSSebastian Grimberg 
919e201c85SYohann   SharedData_Hip data;
929e201c85SYohann   data.t_id_x = threadIdx.x;
939e201c85SYohann   data.t_id_y = threadIdx.y;
949e201c85SYohann   data.t_id_z = threadIdx.z;
959e201c85SYohann   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
966b92dc4bSJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
979e201c85SYohann 
989e201c85SYohann   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
999e201c85SYohann   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
1009e201c85SYohann 
101aa4002adSJeremy L Thompson   // load interp_1d into shared memory
102aa4002adSJeremy L Thompson   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
103aa4002adSJeremy L Thompson   LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
104aa4002adSJeremy L Thompson   __syncthreads();
105aa4002adSJeremy L Thompson 
106aa4002adSJeremy L Thompson   // Apply basis element by element
1079e201c85SYohann   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
1089e201c85SYohann     if (BASIS_DIM == 1) {
1099e201c85SYohann       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
1106b92dc4bSJeremy L Thompson       InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
1119e201c85SYohann       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
1129e201c85SYohann     } else if (BASIS_DIM == 2) {
1139e201c85SYohann       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
1146b92dc4bSJeremy L Thompson       InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
1159e201c85SYohann       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
1169e201c85SYohann     } else if (BASIS_DIM == 3) {
1172b730f8bSJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
1182b730f8bSJeremy L Thompson                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
1196b92dc4bSJeremy L Thompson       InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
1202b730f8bSJeremy L Thompson       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
1212b730f8bSJeremy L Thompson                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
1229e201c85SYohann     }
1239e201c85SYohann   }
1249e201c85SYohann }
1259e201c85SYohann 
126db2becc9SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
127*02219a08SJeremy L Thompson     void InterpCollocatedTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
128*02219a08SJeremy L Thompson   extern __shared__ CeedScalar slice[];
129*02219a08SJeremy L Thompson 
130*02219a08SJeremy L Thompson   SharedData_Hip data;
131*02219a08SJeremy L Thompson   data.t_id_x = threadIdx.x;
132*02219a08SJeremy L Thompson   data.t_id_y = threadIdx.y;
133*02219a08SJeremy L Thompson   data.t_id_z = threadIdx.z;
134*02219a08SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
135*02219a08SJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
136*02219a08SJeremy L Thompson 
137*02219a08SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
138*02219a08SJeremy L Thompson 
139*02219a08SJeremy L Thompson   // Apply basis element by element
140*02219a08SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
141*02219a08SJeremy L Thompson     if (BASIS_DIM == 1) {
142*02219a08SJeremy L Thompson       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
143*02219a08SJeremy L Thompson       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V);
144*02219a08SJeremy L Thompson     } else if (BASIS_DIM == 2) {
145*02219a08SJeremy L Thompson       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
146*02219a08SJeremy L Thompson       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V);
147*02219a08SJeremy L Thompson     } else if (BASIS_DIM == 3) {
148*02219a08SJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
149*02219a08SJeremy L Thompson                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
150*02219a08SJeremy L Thompson       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
151*02219a08SJeremy L Thompson                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V);
152*02219a08SJeremy L Thompson     }
153*02219a08SJeremy L Thompson   }
154*02219a08SJeremy L Thompson }
155*02219a08SJeremy L Thompson 
156*02219a08SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
157aa4002adSJeremy L Thompson     void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
158db2becc9SJeremy L Thompson   extern __shared__ CeedScalar slice[];
159db2becc9SJeremy L Thompson 
160db2becc9SJeremy L Thompson   SharedData_Hip data;
161db2becc9SJeremy L Thompson   data.t_id_x = threadIdx.x;
162db2becc9SJeremy L Thompson   data.t_id_y = threadIdx.y;
163db2becc9SJeremy L Thompson   data.t_id_z = threadIdx.z;
164db2becc9SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
1656b92dc4bSJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
166db2becc9SJeremy L Thompson 
167db2becc9SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
168db2becc9SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
169db2becc9SJeremy L Thompson 
170aa4002adSJeremy L Thompson   // load interp_1d into shared memory
171aa4002adSJeremy L Thompson   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
172aa4002adSJeremy L Thompson   LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
173aa4002adSJeremy L Thompson   __syncthreads();
174aa4002adSJeremy L Thompson 
175aa4002adSJeremy L Thompson   // Apply basis element by element
176db2becc9SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
177db2becc9SJeremy L Thompson     if (BASIS_DIM == 1) {
178db2becc9SJeremy L Thompson       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
1796b92dc4bSJeremy L Thompson       InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
180db2becc9SJeremy L Thompson       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
181db2becc9SJeremy L Thompson     } else if (BASIS_DIM == 2) {
182db2becc9SJeremy L Thompson       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
1836b92dc4bSJeremy L Thompson       InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
184db2becc9SJeremy L Thompson       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
185db2becc9SJeremy L Thompson     } else if (BASIS_DIM == 3) {
186db2becc9SJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
187db2becc9SJeremy L Thompson                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
1886b92dc4bSJeremy L Thompson       InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
189db2becc9SJeremy L Thompson       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
190db2becc9SJeremy L Thompson                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
191db2becc9SJeremy L Thompson     }
192db2becc9SJeremy L Thompson   }
193db2becc9SJeremy L Thompson }
194db2becc9SJeremy L Thompson 
195*02219a08SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
196*02219a08SJeremy L Thompson     void InterpCollocatedTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
197*02219a08SJeremy L Thompson                                       CeedScalar *__restrict__ d_V) {
198*02219a08SJeremy L Thompson   extern __shared__ CeedScalar slice[];
199*02219a08SJeremy L Thompson 
200*02219a08SJeremy L Thompson   SharedData_Hip data;
201*02219a08SJeremy L Thompson   data.t_id_x = threadIdx.x;
202*02219a08SJeremy L Thompson   data.t_id_y = threadIdx.y;
203*02219a08SJeremy L Thompson   data.t_id_z = threadIdx.z;
204*02219a08SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
205*02219a08SJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
206*02219a08SJeremy L Thompson 
207*02219a08SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
208*02219a08SJeremy L Thompson 
209*02219a08SJeremy L Thompson   // Apply basis element by element
210*02219a08SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
211*02219a08SJeremy L Thompson     if (BASIS_DIM == 1) {
212*02219a08SJeremy L Thompson       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
213*02219a08SJeremy L Thompson       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V);
214*02219a08SJeremy L Thompson     } else if (BASIS_DIM == 2) {
215*02219a08SJeremy L Thompson       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
216*02219a08SJeremy L Thompson       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V);
217*02219a08SJeremy L Thompson     } else if (BASIS_DIM == 3) {
218*02219a08SJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
219*02219a08SJeremy L Thompson                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
220*02219a08SJeremy L Thompson       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
221*02219a08SJeremy L Thompson                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V);
222*02219a08SJeremy L Thompson     }
223*02219a08SJeremy L Thompson   }
224*02219a08SJeremy L Thompson }
225*02219a08SJeremy L Thompson 
2269e201c85SYohann //------------------------------------------------------------------------------
2279e201c85SYohann // Grad kernel by dim
2289e201c85SYohann //------------------------------------------------------------------------------
229aa4002adSJeremy L Thompson extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G,
230aa4002adSJeremy L Thompson                                                                          const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
2319e201c85SYohann   extern __shared__ CeedScalar slice[];
232b2165e7aSSebastian Grimberg 
2339e201c85SYohann   SharedData_Hip data;
2349e201c85SYohann   data.t_id_x = threadIdx.x;
2359e201c85SYohann   data.t_id_y = threadIdx.y;
2369e201c85SYohann   data.t_id_z = threadIdx.z;
2379e201c85SYohann   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
2386b92dc4bSJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
2399e201c85SYohann 
2409e201c85SYohann   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
2419e201c85SYohann   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
2429e201c85SYohann 
243aa4002adSJeremy L Thompson   // load interp_1d and grad_1d into shared memory
244aa4002adSJeremy L Thompson   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
245aa4002adSJeremy L Thompson   LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
246aa4002adSJeremy L Thompson   __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
247aa4002adSJeremy L Thompson   LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
248aa4002adSJeremy L Thompson   __syncthreads();
249aa4002adSJeremy L Thompson 
250aa4002adSJeremy L Thompson   // Apply basis element by element
2519e201c85SYohann   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
2529e201c85SYohann     if (BASIS_DIM == 1) {
2539e201c85SYohann       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
2546b92dc4bSJeremy L Thompson       Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
2559e201c85SYohann       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
2569e201c85SYohann     } else if (BASIS_DIM == 2) {
2579e201c85SYohann       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
2586b92dc4bSJeremy L Thompson       GradTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
2592b730f8bSJeremy L Thompson       WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
2602b730f8bSJeremy L Thompson                                                                     d_V);
2619e201c85SYohann     } else if (BASIS_DIM == 3) {
2622b730f8bSJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
2632b730f8bSJeremy L Thompson                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
2646b92dc4bSJeremy L Thompson       if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
2656b92dc4bSJeremy L Thompson       else GradTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
2662b730f8bSJeremy L Thompson       WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
2672b730f8bSJeremy L Thompson                                                                     BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
2689e201c85SYohann     }
2699e201c85SYohann   }
2709e201c85SYohann }
2719e201c85SYohann 
2722b730f8bSJeremy L Thompson extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
273*02219a08SJeremy L Thompson     void GradCollocated(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
274*02219a08SJeremy L Thompson                         CeedScalar *__restrict__ d_V) {
275*02219a08SJeremy L Thompson   extern __shared__ CeedScalar slice[];
276*02219a08SJeremy L Thompson 
277*02219a08SJeremy L Thompson   SharedData_Hip data;
278*02219a08SJeremy L Thompson   data.t_id_x = threadIdx.x;
279*02219a08SJeremy L Thompson   data.t_id_y = threadIdx.y;
280*02219a08SJeremy L Thompson   data.t_id_z = threadIdx.z;
281*02219a08SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
282*02219a08SJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
283*02219a08SJeremy L Thompson 
284*02219a08SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
285*02219a08SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
286*02219a08SJeremy L Thompson 
287*02219a08SJeremy L Thompson   // load interp_1d and grad_1d into shared memory
288*02219a08SJeremy L Thompson   __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
289*02219a08SJeremy L Thompson   LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
290*02219a08SJeremy L Thompson   __syncthreads();
291*02219a08SJeremy L Thompson 
292*02219a08SJeremy L Thompson   // Apply basis element by element
293*02219a08SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
294*02219a08SJeremy L Thompson     if (BASIS_DIM == 1) {
295*02219a08SJeremy L Thompson       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
296*02219a08SJeremy L Thompson       Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
297*02219a08SJeremy L Thompson       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
298*02219a08SJeremy L Thompson     } else if (BASIS_DIM == 2) {
299*02219a08SJeremy L Thompson       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
300*02219a08SJeremy L Thompson       GradTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
301*02219a08SJeremy L Thompson       WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
302*02219a08SJeremy L Thompson                                                                     d_V);
303*02219a08SJeremy L Thompson     } else if (BASIS_DIM == 3) {
304*02219a08SJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
305*02219a08SJeremy L Thompson                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
306*02219a08SJeremy L Thompson       GradTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
307*02219a08SJeremy L Thompson       WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
308*02219a08SJeremy L Thompson                                                                     BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
309*02219a08SJeremy L Thompson     }
310*02219a08SJeremy L Thompson   }
311*02219a08SJeremy L Thompson }
312*02219a08SJeremy L Thompson 
313*02219a08SJeremy L Thompson extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
314aa4002adSJeremy L Thompson     void GradTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
3159e201c85SYohann                        CeedScalar *__restrict__ d_V) {
3169e201c85SYohann   extern __shared__ CeedScalar slice[];
317b2165e7aSSebastian Grimberg 
3189e201c85SYohann   SharedData_Hip data;
3199e201c85SYohann   data.t_id_x = threadIdx.x;
3209e201c85SYohann   data.t_id_y = threadIdx.y;
3219e201c85SYohann   data.t_id_z = threadIdx.z;
3229e201c85SYohann   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
3236b92dc4bSJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
3249e201c85SYohann 
3259e201c85SYohann   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
3269e201c85SYohann   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
3279e201c85SYohann 
328aa4002adSJeremy L Thompson   // load interp_1d and grad_1d into shared memory
329aa4002adSJeremy L Thompson   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
330aa4002adSJeremy L Thompson   LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
331aa4002adSJeremy L Thompson   __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
332aa4002adSJeremy L Thompson   LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
333aa4002adSJeremy L Thompson   __syncthreads();
334aa4002adSJeremy L Thompson 
335aa4002adSJeremy L Thompson   // Apply basis element by element
3369e201c85SYohann   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
3379e201c85SYohann     if (BASIS_DIM == 1) {
3389e201c85SYohann       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
3396b92dc4bSJeremy L Thompson       GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
3409e201c85SYohann       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
3419e201c85SYohann     } else if (BASIS_DIM == 2) {
3422b730f8bSJeremy L Thompson       ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
3432b730f8bSJeremy L Thompson                                                                    r_U);
3446b92dc4bSJeremy L Thompson       GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
3459e201c85SYohann       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
3469e201c85SYohann     } else if (BASIS_DIM == 3) {
3472b730f8bSJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
3482b730f8bSJeremy L Thompson                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
3496b92dc4bSJeremy L Thompson       if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
3506b92dc4bSJeremy L Thompson       else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
3512b730f8bSJeremy L Thompson       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
3522b730f8bSJeremy L Thompson                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
3539e201c85SYohann     }
3549e201c85SYohann   }
3559e201c85SYohann }
3569e201c85SYohann 
357db2becc9SJeremy L Thompson extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
358*02219a08SJeremy L Thompson     void GradCollocatedTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
359*02219a08SJeremy L Thompson                                  CeedScalar *__restrict__ d_V) {
360*02219a08SJeremy L Thompson   extern __shared__ CeedScalar slice[];
361*02219a08SJeremy L Thompson 
362*02219a08SJeremy L Thompson   SharedData_Hip data;
363*02219a08SJeremy L Thompson   data.t_id_x = threadIdx.x;
364*02219a08SJeremy L Thompson   data.t_id_y = threadIdx.y;
365*02219a08SJeremy L Thompson   data.t_id_z = threadIdx.z;
366*02219a08SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
367*02219a08SJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
368*02219a08SJeremy L Thompson 
369*02219a08SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
370*02219a08SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
371*02219a08SJeremy L Thompson 
372*02219a08SJeremy L Thompson   // load interp_1d and grad_1d into shared memory
373*02219a08SJeremy L Thompson   __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
374*02219a08SJeremy L Thompson   LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
375*02219a08SJeremy L Thompson   __syncthreads();
376*02219a08SJeremy L Thompson 
377*02219a08SJeremy L Thompson   // Apply basis element by element
378*02219a08SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
379*02219a08SJeremy L Thompson     if (BASIS_DIM == 1) {
380*02219a08SJeremy L Thompson       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
381*02219a08SJeremy L Thompson       GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
382*02219a08SJeremy L Thompson       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
383*02219a08SJeremy L Thompson     } else if (BASIS_DIM == 2) {
384*02219a08SJeremy L Thompson       ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
385*02219a08SJeremy L Thompson                                                                    r_U);
386*02219a08SJeremy L Thompson       GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
387*02219a08SJeremy L Thompson       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
388*02219a08SJeremy L Thompson     } else if (BASIS_DIM == 3) {
389*02219a08SJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
390*02219a08SJeremy L Thompson                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
391*02219a08SJeremy L Thompson       GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
392*02219a08SJeremy L Thompson       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
393*02219a08SJeremy L Thompson                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
394*02219a08SJeremy L Thompson     }
395*02219a08SJeremy L Thompson   }
396*02219a08SJeremy L Thompson }
397*02219a08SJeremy L Thompson 
398*02219a08SJeremy L Thompson extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
399aa4002adSJeremy L Thompson     void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
400db2becc9SJeremy L Thompson                           CeedScalar *__restrict__ d_V) {
401db2becc9SJeremy L Thompson   extern __shared__ CeedScalar slice[];
402db2becc9SJeremy L Thompson 
403db2becc9SJeremy L Thompson   SharedData_Hip data;
404db2becc9SJeremy L Thompson   data.t_id_x = threadIdx.x;
405db2becc9SJeremy L Thompson   data.t_id_y = threadIdx.y;
406db2becc9SJeremy L Thompson   data.t_id_z = threadIdx.z;
407db2becc9SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
4086b92dc4bSJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
409db2becc9SJeremy L Thompson 
410db2becc9SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
411db2becc9SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
412db2becc9SJeremy L Thompson 
413aa4002adSJeremy L Thompson   // load interp_1d and grad_1d into shared memory
414aa4002adSJeremy L Thompson   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
415aa4002adSJeremy L Thompson   LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
416aa4002adSJeremy L Thompson   __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
417aa4002adSJeremy L Thompson   LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
418aa4002adSJeremy L Thompson   __syncthreads();
419aa4002adSJeremy L Thompson 
420aa4002adSJeremy L Thompson   // Apply basis element by element
421db2becc9SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
422db2becc9SJeremy L Thompson     if (BASIS_DIM == 1) {
423db2becc9SJeremy L Thompson       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
4246b92dc4bSJeremy L Thompson       GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
425db2becc9SJeremy L Thompson       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
426db2becc9SJeremy L Thompson     } else if (BASIS_DIM == 2) {
427db2becc9SJeremy L Thompson       ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
428db2becc9SJeremy L Thompson                                                                    r_U);
4296b92dc4bSJeremy L Thompson       GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
430db2becc9SJeremy L Thompson       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
431db2becc9SJeremy L Thompson     } else if (BASIS_DIM == 3) {
432db2becc9SJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
433db2becc9SJeremy L Thompson                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
4346b92dc4bSJeremy L Thompson       if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
4356b92dc4bSJeremy L Thompson       else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
436db2becc9SJeremy L Thompson       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
437db2becc9SJeremy L Thompson                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
438db2becc9SJeremy L Thompson     }
439db2becc9SJeremy L Thompson   }
440db2becc9SJeremy L Thompson }
441db2becc9SJeremy L Thompson 
442*02219a08SJeremy L Thompson extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
443*02219a08SJeremy L Thompson     void GradCollocatedTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
444*02219a08SJeremy L Thompson                                     CeedScalar *__restrict__ d_V) {
445*02219a08SJeremy L Thompson   extern __shared__ CeedScalar slice[];
446*02219a08SJeremy L Thompson 
447*02219a08SJeremy L Thompson   SharedData_Hip data;
448*02219a08SJeremy L Thompson   data.t_id_x = threadIdx.x;
449*02219a08SJeremy L Thompson   data.t_id_y = threadIdx.y;
450*02219a08SJeremy L Thompson   data.t_id_z = threadIdx.z;
451*02219a08SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
452*02219a08SJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
453*02219a08SJeremy L Thompson 
454*02219a08SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
455*02219a08SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
456*02219a08SJeremy L Thompson 
457*02219a08SJeremy L Thompson   // load interp_1d and grad_1d into shared memory
458*02219a08SJeremy L Thompson   __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
459*02219a08SJeremy L Thompson   LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
460*02219a08SJeremy L Thompson   __syncthreads();
461*02219a08SJeremy L Thompson 
462*02219a08SJeremy L Thompson   // Apply basis element by element
463*02219a08SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
464*02219a08SJeremy L Thompson     if (BASIS_DIM == 1) {
465*02219a08SJeremy L Thompson       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
466*02219a08SJeremy L Thompson       GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
467*02219a08SJeremy L Thompson       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
468*02219a08SJeremy L Thompson     } else if (BASIS_DIM == 2) {
469*02219a08SJeremy L Thompson       ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
470*02219a08SJeremy L Thompson                                                                    r_U);
471*02219a08SJeremy L Thompson       GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
472*02219a08SJeremy L Thompson       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
473*02219a08SJeremy L Thompson     } else if (BASIS_DIM == 3) {
474*02219a08SJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
475*02219a08SJeremy L Thompson                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
476*02219a08SJeremy L Thompson       GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_G, r_V);
477*02219a08SJeremy L Thompson       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
478*02219a08SJeremy L Thompson                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
479*02219a08SJeremy L Thompson     }
480*02219a08SJeremy L Thompson   }
481*02219a08SJeremy L Thompson }
482*02219a08SJeremy L Thompson 
4839e201c85SYohann //------------------------------------------------------------------------------
4849e201c85SYohann // Weight kernels by dim
4859e201c85SYohann //------------------------------------------------------------------------------
4862b730f8bSJeremy L Thompson extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__
4872b730f8bSJeremy L Thompson     void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *__restrict__ d_W) {
4889e201c85SYohann   extern __shared__ CeedScalar slice[];
4899e201c85SYohann 
4909e201c85SYohann   SharedData_Hip data;
4919e201c85SYohann   data.t_id_x = threadIdx.x;
4929e201c85SYohann   data.t_id_y = threadIdx.y;
4939e201c85SYohann   data.t_id_z = threadIdx.z;
4949e201c85SYohann   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
4956b92dc4bSJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
4969e201c85SYohann 
4979e201c85SYohann   CeedScalar r_W[BASIS_DIM > 2 ? BASIS_Q_1D : 1];
4989e201c85SYohann 
4999e201c85SYohann   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
5009e201c85SYohann     if (BASIS_DIM == 1) {
501ca595be6SJeremy L Thompson       Weight1d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
5029e201c85SYohann       WriteElementStrided1d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_W, d_W);
5039e201c85SYohann     } else if (BASIS_DIM == 2) {
504ca595be6SJeremy L Thompson       WeightTensor2d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
5059e201c85SYohann       WriteElementStrided2d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_W, d_W);
5069e201c85SYohann     } else if (BASIS_DIM == 3) {
507ca595be6SJeremy L Thompson       WeightTensor3d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
5082b730f8bSJeremy L Thompson       WriteElementStrided3d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_W,
5092b730f8bSJeremy L Thompson                                            d_W);
5109e201c85SYohann     }
5119e201c85SYohann   }
5129e201c85SYohann }
513