xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor.h (revision d4cc18453651bd0f94c1a2e078b2646a92dafdcc)
1*9ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
29e201c85SYohann // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39e201c85SYohann //
49e201c85SYohann // SPDX-License-Identifier: BSD-2-Clause
59e201c85SYohann //
69e201c85SYohann // This file is part of CEED:  http://github.com/ceed
79e201c85SYohann 
89e201c85SYohann /// @file
99e201c85SYohann /// Internal header for HIP shared memory tensor product basis
10c0b5abf0SJeremy L Thompson #include <ceed/types.h>
112b730f8bSJeremy L Thompson 
129e201c85SYohann #include "hip-shared-basis-read-write-templates.h"
139e201c85SYohann #include "hip-shared-basis-tensor-templates.h"
149e201c85SYohann 
159e201c85SYohann //------------------------------------------------------------------------------
169e201c85SYohann // Interp kernel by dim
179e201c85SYohann //------------------------------------------------------------------------------
__launch_bounds__(BASIS_INTERP_BLOCK_SIZE)182b730f8bSJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
19aa4002adSJeremy L Thompson     void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
209e201c85SYohann   extern __shared__ CeedScalar slice[];
21b2165e7aSSebastian Grimberg 
229e201c85SYohann   SharedData_Hip data;
239e201c85SYohann   data.t_id_x = threadIdx.x;
249e201c85SYohann   data.t_id_y = threadIdx.y;
259e201c85SYohann   data.t_id_z = threadIdx.z;
269e201c85SYohann   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
276b92dc4bSJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
289e201c85SYohann 
299e201c85SYohann   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
309e201c85SYohann   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
319e201c85SYohann 
32aa4002adSJeremy L Thompson   // load interp_1d into shared memory
33aa4002adSJeremy L Thompson   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
34aa4002adSJeremy L Thompson   LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
35aa4002adSJeremy L Thompson   __syncthreads();
36aa4002adSJeremy L Thompson 
37aa4002adSJeremy L Thompson   // Apply basis element by element
389e201c85SYohann   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
399e201c85SYohann     if (BASIS_DIM == 1) {
409e201c85SYohann       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
416b92dc4bSJeremy L Thompson       Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
429e201c85SYohann       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
439e201c85SYohann     } else if (BASIS_DIM == 2) {
449e201c85SYohann       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
456b92dc4bSJeremy L Thompson       InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
469e201c85SYohann       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
479e201c85SYohann     } else if (BASIS_DIM == 3) {
482b730f8bSJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
492b730f8bSJeremy L Thompson                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
506b92dc4bSJeremy L Thompson       InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
512b730f8bSJeremy L Thompson       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
522b730f8bSJeremy L Thompson                                                         BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
539e201c85SYohann     }
549e201c85SYohann   }
559e201c85SYohann }
569e201c85SYohann 
__launch_bounds__(BASIS_INTERP_BLOCK_SIZE)572b730f8bSJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
5802219a08SJeremy L Thompson     void InterpCollocated(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
5902219a08SJeremy L Thompson   extern __shared__ CeedScalar slice[];
6002219a08SJeremy L Thompson 
6102219a08SJeremy L Thompson   SharedData_Hip data;
6202219a08SJeremy L Thompson   data.t_id_x = threadIdx.x;
6302219a08SJeremy L Thompson   data.t_id_y = threadIdx.y;
6402219a08SJeremy L Thompson   data.t_id_z = threadIdx.z;
6502219a08SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
6602219a08SJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
6702219a08SJeremy L Thompson 
6802219a08SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
6902219a08SJeremy L Thompson 
7002219a08SJeremy L Thompson   // Apply basis element by element
7102219a08SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
7202219a08SJeremy L Thompson     if (BASIS_DIM == 1) {
7302219a08SJeremy L Thompson       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
7402219a08SJeremy L Thompson       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_U, d_V);
7502219a08SJeremy L Thompson     } else if (BASIS_DIM == 2) {
7602219a08SJeremy L Thompson       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
7702219a08SJeremy L Thompson       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_U, d_V);
7802219a08SJeremy L Thompson     } else if (BASIS_DIM == 3) {
7902219a08SJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
8002219a08SJeremy L Thompson                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
8102219a08SJeremy L Thompson       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
8202219a08SJeremy L Thompson                                                         BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_U, d_V);
8302219a08SJeremy L Thompson     }
8402219a08SJeremy L Thompson   }
8502219a08SJeremy L Thompson }
8602219a08SJeremy L Thompson 
__launch_bounds__(BASIS_INTERP_BLOCK_SIZE)8702219a08SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
88aa4002adSJeremy L Thompson     void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
899e201c85SYohann   extern __shared__ CeedScalar slice[];
90b2165e7aSSebastian Grimberg 
919e201c85SYohann   SharedData_Hip data;
929e201c85SYohann   data.t_id_x = threadIdx.x;
939e201c85SYohann   data.t_id_y = threadIdx.y;
949e201c85SYohann   data.t_id_z = threadIdx.z;
959e201c85SYohann   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
966b92dc4bSJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
979e201c85SYohann 
989e201c85SYohann   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
999e201c85SYohann   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
1009e201c85SYohann 
101aa4002adSJeremy L Thompson   // load interp_1d into shared memory
102aa4002adSJeremy L Thompson   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
103aa4002adSJeremy L Thompson   LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
104aa4002adSJeremy L Thompson   __syncthreads();
105aa4002adSJeremy L Thompson 
106aa4002adSJeremy L Thompson   // Apply basis element by element
1079e201c85SYohann   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
1089e201c85SYohann     if (BASIS_DIM == 1) {
1099e201c85SYohann       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
1106b92dc4bSJeremy L Thompson       InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
1119e201c85SYohann       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
1129e201c85SYohann     } else if (BASIS_DIM == 2) {
1139e201c85SYohann       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
1146b92dc4bSJeremy L Thompson       InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
1159e201c85SYohann       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
1169e201c85SYohann     } else if (BASIS_DIM == 3) {
1172b730f8bSJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
1182b730f8bSJeremy L Thompson                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
1196b92dc4bSJeremy L Thompson       InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
1202b730f8bSJeremy L Thompson       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
1212b730f8bSJeremy L Thompson                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
1229e201c85SYohann     }
1239e201c85SYohann   }
1249e201c85SYohann }
1259e201c85SYohann 
__launch_bounds__(BASIS_INTERP_BLOCK_SIZE)126db2becc9SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
12702219a08SJeremy L Thompson     void InterpCollocatedTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
12802219a08SJeremy L Thompson   extern __shared__ CeedScalar slice[];
12902219a08SJeremy L Thompson 
13002219a08SJeremy L Thompson   SharedData_Hip data;
13102219a08SJeremy L Thompson   data.t_id_x = threadIdx.x;
13202219a08SJeremy L Thompson   data.t_id_y = threadIdx.y;
13302219a08SJeremy L Thompson   data.t_id_z = threadIdx.z;
13402219a08SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
13502219a08SJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
13602219a08SJeremy L Thompson 
13702219a08SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
13802219a08SJeremy L Thompson 
13902219a08SJeremy L Thompson   // Apply basis element by element
14002219a08SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
14102219a08SJeremy L Thompson     if (BASIS_DIM == 1) {
14202219a08SJeremy L Thompson       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
14302219a08SJeremy L Thompson       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V);
14402219a08SJeremy L Thompson     } else if (BASIS_DIM == 2) {
14502219a08SJeremy L Thompson       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
14602219a08SJeremy L Thompson       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V);
14702219a08SJeremy L Thompson     } else if (BASIS_DIM == 3) {
14802219a08SJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
14902219a08SJeremy L Thompson                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
15002219a08SJeremy L Thompson       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
15102219a08SJeremy L Thompson                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V);
15202219a08SJeremy L Thompson     }
15302219a08SJeremy L Thompson   }
15402219a08SJeremy L Thompson }
15502219a08SJeremy L Thompson 
__launch_bounds__(BASIS_INTERP_BLOCK_SIZE)15602219a08SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
157aa4002adSJeremy L Thompson     void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
158db2becc9SJeremy L Thompson   extern __shared__ CeedScalar slice[];
159db2becc9SJeremy L Thompson 
160db2becc9SJeremy L Thompson   SharedData_Hip data;
161db2becc9SJeremy L Thompson   data.t_id_x = threadIdx.x;
162db2becc9SJeremy L Thompson   data.t_id_y = threadIdx.y;
163db2becc9SJeremy L Thompson   data.t_id_z = threadIdx.z;
164db2becc9SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
1656b92dc4bSJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
166db2becc9SJeremy L Thompson 
167db2becc9SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
168db2becc9SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
169db2becc9SJeremy L Thompson 
170aa4002adSJeremy L Thompson   // load interp_1d into shared memory
171aa4002adSJeremy L Thompson   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
172aa4002adSJeremy L Thompson   LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
173aa4002adSJeremy L Thompson   __syncthreads();
174aa4002adSJeremy L Thompson 
175aa4002adSJeremy L Thompson   // Apply basis element by element
176db2becc9SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
177db2becc9SJeremy L Thompson     if (BASIS_DIM == 1) {
178db2becc9SJeremy L Thompson       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
1796b92dc4bSJeremy L Thompson       InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
180db2becc9SJeremy L Thompson       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
181db2becc9SJeremy L Thompson     } else if (BASIS_DIM == 2) {
182db2becc9SJeremy L Thompson       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
1836b92dc4bSJeremy L Thompson       InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
184db2becc9SJeremy L Thompson       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
185db2becc9SJeremy L Thompson     } else if (BASIS_DIM == 3) {
186db2becc9SJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
187db2becc9SJeremy L Thompson                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
1886b92dc4bSJeremy L Thompson       InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
189db2becc9SJeremy L Thompson       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
190db2becc9SJeremy L Thompson                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
191db2becc9SJeremy L Thompson     }
192db2becc9SJeremy L Thompson   }
193db2becc9SJeremy L Thompson }
194db2becc9SJeremy L Thompson 
__launch_bounds__(BASIS_INTERP_BLOCK_SIZE)19502219a08SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
19602219a08SJeremy L Thompson     void InterpCollocatedTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
19702219a08SJeremy L Thompson                                       CeedScalar *__restrict__ d_V) {
19802219a08SJeremy L Thompson   extern __shared__ CeedScalar slice[];
19902219a08SJeremy L Thompson 
20002219a08SJeremy L Thompson   SharedData_Hip data;
20102219a08SJeremy L Thompson   data.t_id_x = threadIdx.x;
20202219a08SJeremy L Thompson   data.t_id_y = threadIdx.y;
20302219a08SJeremy L Thompson   data.t_id_z = threadIdx.z;
20402219a08SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
20502219a08SJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
20602219a08SJeremy L Thompson 
20702219a08SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
20802219a08SJeremy L Thompson 
20902219a08SJeremy L Thompson   // Apply basis element by element
21002219a08SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
21102219a08SJeremy L Thompson     if (BASIS_DIM == 1) {
21202219a08SJeremy L Thompson       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
21302219a08SJeremy L Thompson       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V);
21402219a08SJeremy L Thompson     } else if (BASIS_DIM == 2) {
21502219a08SJeremy L Thompson       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
21602219a08SJeremy L Thompson       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V);
21702219a08SJeremy L Thompson     } else if (BASIS_DIM == 3) {
21802219a08SJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
21902219a08SJeremy L Thompson                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
22002219a08SJeremy L Thompson       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
22102219a08SJeremy L Thompson                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V);
22202219a08SJeremy L Thompson     }
22302219a08SJeremy L Thompson   }
22402219a08SJeremy L Thompson }
22502219a08SJeremy L Thompson 
2269e201c85SYohann //------------------------------------------------------------------------------
2279e201c85SYohann // Grad kernel by dim
2289e201c85SYohann //------------------------------------------------------------------------------
__launch_bounds__(BASIS_GRAD_BLOCK_SIZE)229aa4002adSJeremy L Thompson extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G,
230aa4002adSJeremy L Thompson                                                                          const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
2319e201c85SYohann   extern __shared__ CeedScalar slice[];
232b2165e7aSSebastian Grimberg 
2339e201c85SYohann   SharedData_Hip data;
2349e201c85SYohann   data.t_id_x = threadIdx.x;
2359e201c85SYohann   data.t_id_y = threadIdx.y;
2369e201c85SYohann   data.t_id_z = threadIdx.z;
2379e201c85SYohann   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
2386b92dc4bSJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
2399e201c85SYohann 
2409e201c85SYohann   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
2419e201c85SYohann   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
2429e201c85SYohann 
243aa4002adSJeremy L Thompson   // load interp_1d and grad_1d into shared memory
244aa4002adSJeremy L Thompson   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
245aa4002adSJeremy L Thompson   LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
246aa4002adSJeremy L Thompson   __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
247aa4002adSJeremy L Thompson   LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
248aa4002adSJeremy L Thompson   __syncthreads();
249aa4002adSJeremy L Thompson 
250aa4002adSJeremy L Thompson   // Apply basis element by element
2519e201c85SYohann   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
2529e201c85SYohann     if (BASIS_DIM == 1) {
2539e201c85SYohann       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
2546b92dc4bSJeremy L Thompson       Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
2559e201c85SYohann       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
2569e201c85SYohann     } else if (BASIS_DIM == 2) {
2579e201c85SYohann       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
2586b92dc4bSJeremy L Thompson       GradTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
2592b730f8bSJeremy L Thompson       WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
2602b730f8bSJeremy L Thompson                                                                     d_V);
2619e201c85SYohann     } else if (BASIS_DIM == 3) {
2622b730f8bSJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
2632b730f8bSJeremy L Thompson                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
2646b92dc4bSJeremy L Thompson       if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
2656b92dc4bSJeremy L Thompson       else GradTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
2662b730f8bSJeremy L Thompson       WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
2672b730f8bSJeremy L Thompson                                                                     BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
2689e201c85SYohann     }
2699e201c85SYohann   }
2709e201c85SYohann }
2719e201c85SYohann 
__launch_bounds__(BASIS_GRAD_BLOCK_SIZE)2722b730f8bSJeremy L Thompson extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
27302219a08SJeremy L Thompson     void GradCollocated(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
27402219a08SJeremy L Thompson                         CeedScalar *__restrict__ d_V) {
27502219a08SJeremy L Thompson   extern __shared__ CeedScalar slice[];
27602219a08SJeremy L Thompson 
27702219a08SJeremy L Thompson   SharedData_Hip data;
27802219a08SJeremy L Thompson   data.t_id_x = threadIdx.x;
27902219a08SJeremy L Thompson   data.t_id_y = threadIdx.y;
28002219a08SJeremy L Thompson   data.t_id_z = threadIdx.z;
28102219a08SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
28202219a08SJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
28302219a08SJeremy L Thompson 
28402219a08SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
28502219a08SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
28602219a08SJeremy L Thompson 
28702219a08SJeremy L Thompson   // load interp_1d and grad_1d into shared memory
28802219a08SJeremy L Thompson   __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
28902219a08SJeremy L Thompson   LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
29002219a08SJeremy L Thompson   __syncthreads();
29102219a08SJeremy L Thompson 
29202219a08SJeremy L Thompson   // Apply basis element by element
29302219a08SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
29402219a08SJeremy L Thompson     if (BASIS_DIM == 1) {
29502219a08SJeremy L Thompson       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
29602219a08SJeremy L Thompson       Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
29702219a08SJeremy L Thompson       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
29802219a08SJeremy L Thompson     } else if (BASIS_DIM == 2) {
29902219a08SJeremy L Thompson       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
3000ccda8ebSJeremy L Thompson       GradTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
30102219a08SJeremy L Thompson       WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
30202219a08SJeremy L Thompson                                                                     d_V);
30302219a08SJeremy L Thompson     } else if (BASIS_DIM == 3) {
30402219a08SJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
30502219a08SJeremy L Thompson                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
3060ccda8ebSJeremy L Thompson       GradTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
30702219a08SJeremy L Thompson       WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
30802219a08SJeremy L Thompson                                                                     BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
30902219a08SJeremy L Thompson     }
31002219a08SJeremy L Thompson   }
31102219a08SJeremy L Thompson }
31202219a08SJeremy L Thompson 
__launch_bounds__(BASIS_GRAD_BLOCK_SIZE)31302219a08SJeremy L Thompson extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
314aa4002adSJeremy L Thompson     void GradTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
3159e201c85SYohann                        CeedScalar *__restrict__ d_V) {
3169e201c85SYohann   extern __shared__ CeedScalar slice[];
317b2165e7aSSebastian Grimberg 
3189e201c85SYohann   SharedData_Hip data;
3199e201c85SYohann   data.t_id_x = threadIdx.x;
3209e201c85SYohann   data.t_id_y = threadIdx.y;
3219e201c85SYohann   data.t_id_z = threadIdx.z;
3229e201c85SYohann   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
3236b92dc4bSJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
3249e201c85SYohann 
3259e201c85SYohann   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
3269e201c85SYohann   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
3279e201c85SYohann 
328aa4002adSJeremy L Thompson   // load interp_1d and grad_1d into shared memory
329aa4002adSJeremy L Thompson   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
330aa4002adSJeremy L Thompson   LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
331aa4002adSJeremy L Thompson   __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
332aa4002adSJeremy L Thompson   LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
333aa4002adSJeremy L Thompson   __syncthreads();
334aa4002adSJeremy L Thompson 
335aa4002adSJeremy L Thompson   // Apply basis element by element
3369e201c85SYohann   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
3379e201c85SYohann     if (BASIS_DIM == 1) {
3389e201c85SYohann       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
3396b92dc4bSJeremy L Thompson       GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
3409e201c85SYohann       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
3419e201c85SYohann     } else if (BASIS_DIM == 2) {
3422b730f8bSJeremy L Thompson       ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
3432b730f8bSJeremy L Thompson                                                                    r_U);
3446b92dc4bSJeremy L Thompson       GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
3459e201c85SYohann       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
3469e201c85SYohann     } else if (BASIS_DIM == 3) {
3472b730f8bSJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
3482b730f8bSJeremy L Thompson                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
3496b92dc4bSJeremy L Thompson       if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
3506b92dc4bSJeremy L Thompson       else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
3512b730f8bSJeremy L Thompson       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
3522b730f8bSJeremy L Thompson                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
3539e201c85SYohann     }
3549e201c85SYohann   }
3559e201c85SYohann }
3569e201c85SYohann 
__launch_bounds__(BASIS_GRAD_BLOCK_SIZE)357db2becc9SJeremy L Thompson extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
35802219a08SJeremy L Thompson     void GradCollocatedTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
35902219a08SJeremy L Thompson                                  CeedScalar *__restrict__ d_V) {
36002219a08SJeremy L Thompson   extern __shared__ CeedScalar slice[];
36102219a08SJeremy L Thompson 
36202219a08SJeremy L Thompson   SharedData_Hip data;
36302219a08SJeremy L Thompson   data.t_id_x = threadIdx.x;
36402219a08SJeremy L Thompson   data.t_id_y = threadIdx.y;
36502219a08SJeremy L Thompson   data.t_id_z = threadIdx.z;
36602219a08SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
36702219a08SJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
36802219a08SJeremy L Thompson 
36902219a08SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
37002219a08SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
37102219a08SJeremy L Thompson 
37202219a08SJeremy L Thompson   // load interp_1d and grad_1d into shared memory
37302219a08SJeremy L Thompson   __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
37402219a08SJeremy L Thompson   LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
37502219a08SJeremy L Thompson   __syncthreads();
37602219a08SJeremy L Thompson 
37702219a08SJeremy L Thompson   // Apply basis element by element
37802219a08SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
37902219a08SJeremy L Thompson     if (BASIS_DIM == 1) {
38002219a08SJeremy L Thompson       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
38102219a08SJeremy L Thompson       GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
38202219a08SJeremy L Thompson       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
38302219a08SJeremy L Thompson     } else if (BASIS_DIM == 2) {
38402219a08SJeremy L Thompson       ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
38502219a08SJeremy L Thompson                                                                    r_U);
3860ccda8ebSJeremy L Thompson       GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
38702219a08SJeremy L Thompson       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
38802219a08SJeremy L Thompson     } else if (BASIS_DIM == 3) {
38902219a08SJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
39002219a08SJeremy L Thompson                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
3910ccda8ebSJeremy L Thompson       GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
39202219a08SJeremy L Thompson       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
39302219a08SJeremy L Thompson                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
39402219a08SJeremy L Thompson     }
39502219a08SJeremy L Thompson   }
39602219a08SJeremy L Thompson }
39702219a08SJeremy L Thompson 
__launch_bounds__(BASIS_GRAD_BLOCK_SIZE)39802219a08SJeremy L Thompson extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
399aa4002adSJeremy L Thompson     void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
400db2becc9SJeremy L Thompson                           CeedScalar *__restrict__ d_V) {
401db2becc9SJeremy L Thompson   extern __shared__ CeedScalar slice[];
402db2becc9SJeremy L Thompson 
403db2becc9SJeremy L Thompson   SharedData_Hip data;
404db2becc9SJeremy L Thompson   data.t_id_x = threadIdx.x;
405db2becc9SJeremy L Thompson   data.t_id_y = threadIdx.y;
406db2becc9SJeremy L Thompson   data.t_id_z = threadIdx.z;
407db2becc9SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
4086b92dc4bSJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
409db2becc9SJeremy L Thompson 
410db2becc9SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
411db2becc9SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
412db2becc9SJeremy L Thompson 
413aa4002adSJeremy L Thompson   // load interp_1d and grad_1d into shared memory
414aa4002adSJeremy L Thompson   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
415aa4002adSJeremy L Thompson   LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
416aa4002adSJeremy L Thompson   __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
417aa4002adSJeremy L Thompson   LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
418aa4002adSJeremy L Thompson   __syncthreads();
419aa4002adSJeremy L Thompson 
420aa4002adSJeremy L Thompson   // Apply basis element by element
421db2becc9SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
422db2becc9SJeremy L Thompson     if (BASIS_DIM == 1) {
423db2becc9SJeremy L Thompson       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
4246b92dc4bSJeremy L Thompson       GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
425db2becc9SJeremy L Thompson       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
426db2becc9SJeremy L Thompson     } else if (BASIS_DIM == 2) {
427db2becc9SJeremy L Thompson       ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
428db2becc9SJeremy L Thompson                                                                    r_U);
4296b92dc4bSJeremy L Thompson       GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
430db2becc9SJeremy L Thompson       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
431db2becc9SJeremy L Thompson     } else if (BASIS_DIM == 3) {
432db2becc9SJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
433db2becc9SJeremy L Thompson                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
4346b92dc4bSJeremy L Thompson       if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
4356b92dc4bSJeremy L Thompson       else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
436db2becc9SJeremy L Thompson       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
437db2becc9SJeremy L Thompson                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
438db2becc9SJeremy L Thompson     }
439db2becc9SJeremy L Thompson   }
440db2becc9SJeremy L Thompson }
441db2becc9SJeremy L Thompson 
__launch_bounds__(BASIS_GRAD_BLOCK_SIZE)44202219a08SJeremy L Thompson extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
44302219a08SJeremy L Thompson     void GradCollocatedTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
44402219a08SJeremy L Thompson                                     CeedScalar *__restrict__ d_V) {
44502219a08SJeremy L Thompson   extern __shared__ CeedScalar slice[];
44602219a08SJeremy L Thompson 
44702219a08SJeremy L Thompson   SharedData_Hip data;
44802219a08SJeremy L Thompson   data.t_id_x = threadIdx.x;
44902219a08SJeremy L Thompson   data.t_id_y = threadIdx.y;
45002219a08SJeremy L Thompson   data.t_id_z = threadIdx.z;
45102219a08SJeremy L Thompson   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
45202219a08SJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
45302219a08SJeremy L Thompson 
45402219a08SJeremy L Thompson   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
45502219a08SJeremy L Thompson   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
45602219a08SJeremy L Thompson 
45702219a08SJeremy L Thompson   // load interp_1d and grad_1d into shared memory
45802219a08SJeremy L Thompson   __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
45902219a08SJeremy L Thompson   LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
46002219a08SJeremy L Thompson   __syncthreads();
46102219a08SJeremy L Thompson 
46202219a08SJeremy L Thompson   // Apply basis element by element
46302219a08SJeremy L Thompson   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
46402219a08SJeremy L Thompson     if (BASIS_DIM == 1) {
46502219a08SJeremy L Thompson       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
46602219a08SJeremy L Thompson       GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
46702219a08SJeremy L Thompson       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
46802219a08SJeremy L Thompson     } else if (BASIS_DIM == 2) {
46902219a08SJeremy L Thompson       ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
47002219a08SJeremy L Thompson                                                                    r_U);
4710ccda8ebSJeremy L Thompson       GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
47202219a08SJeremy L Thompson       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
47302219a08SJeremy L Thompson     } else if (BASIS_DIM == 3) {
47402219a08SJeremy L Thompson       ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
47502219a08SJeremy L Thompson                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
4760ccda8ebSJeremy L Thompson       GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
47702219a08SJeremy L Thompson       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
47802219a08SJeremy L Thompson                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
47902219a08SJeremy L Thompson     }
48002219a08SJeremy L Thompson   }
48102219a08SJeremy L Thompson }
48202219a08SJeremy L Thompson 
4839e201c85SYohann //------------------------------------------------------------------------------
4849e201c85SYohann // Weight kernels by dim
4859e201c85SYohann //------------------------------------------------------------------------------
__launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE)4862b730f8bSJeremy L Thompson extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__
4872b730f8bSJeremy L Thompson     void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *__restrict__ d_W) {
4889e201c85SYohann   extern __shared__ CeedScalar slice[];
4899e201c85SYohann 
4909e201c85SYohann   SharedData_Hip data;
4919e201c85SYohann   data.t_id_x = threadIdx.x;
4929e201c85SYohann   data.t_id_y = threadIdx.y;
4939e201c85SYohann   data.t_id_z = threadIdx.z;
4949e201c85SYohann   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
4956b92dc4bSJeremy L Thompson   data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
4969e201c85SYohann 
4979e201c85SYohann   CeedScalar r_W[BASIS_DIM > 2 ? BASIS_Q_1D : 1];
4989e201c85SYohann 
4999e201c85SYohann   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
5009e201c85SYohann     if (BASIS_DIM == 1) {
501ca595be6SJeremy L Thompson       Weight1d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
5029e201c85SYohann       WriteElementStrided1d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_W, d_W);
5039e201c85SYohann     } else if (BASIS_DIM == 2) {
504ca595be6SJeremy L Thompson       WeightTensor2d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
5059e201c85SYohann       WriteElementStrided2d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_W, d_W);
5069e201c85SYohann     } else if (BASIS_DIM == 3) {
507ca595be6SJeremy L Thompson       WeightTensor3d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
5082b730f8bSJeremy L Thompson       WriteElementStrided3d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_W,
5092b730f8bSJeremy L Thompson                                            d_W);
5109e201c85SYohann     }
5119e201c85SYohann   }
5129e201c85SYohann }
513