1*9ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
29e201c85SYohann // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39e201c85SYohann //
49e201c85SYohann // SPDX-License-Identifier: BSD-2-Clause
59e201c85SYohann //
69e201c85SYohann // This file is part of CEED: http://github.com/ceed
79e201c85SYohann
89e201c85SYohann /// @file
99e201c85SYohann /// Internal header for HIP shared memory tensor product basis
10c0b5abf0SJeremy L Thompson #include <ceed/types.h>
112b730f8bSJeremy L Thompson
129e201c85SYohann #include "hip-shared-basis-read-write-templates.h"
139e201c85SYohann #include "hip-shared-basis-tensor-templates.h"
149e201c85SYohann
159e201c85SYohann //------------------------------------------------------------------------------
169e201c85SYohann // Interp kernel by dim
179e201c85SYohann //------------------------------------------------------------------------------
__launch_bounds__(BASIS_INTERP_BLOCK_SIZE)182b730f8bSJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
19aa4002adSJeremy L Thompson void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
209e201c85SYohann extern __shared__ CeedScalar slice[];
21b2165e7aSSebastian Grimberg
229e201c85SYohann SharedData_Hip data;
239e201c85SYohann data.t_id_x = threadIdx.x;
249e201c85SYohann data.t_id_y = threadIdx.y;
259e201c85SYohann data.t_id_z = threadIdx.z;
269e201c85SYohann data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
276b92dc4bSJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
289e201c85SYohann
299e201c85SYohann CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
309e201c85SYohann CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
319e201c85SYohann
32aa4002adSJeremy L Thompson // load interp_1d into shared memory
33aa4002adSJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
34aa4002adSJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
35aa4002adSJeremy L Thompson __syncthreads();
36aa4002adSJeremy L Thompson
37aa4002adSJeremy L Thompson // Apply basis element by element
389e201c85SYohann for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
399e201c85SYohann if (BASIS_DIM == 1) {
409e201c85SYohann ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
416b92dc4bSJeremy L Thompson Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
429e201c85SYohann WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
439e201c85SYohann } else if (BASIS_DIM == 2) {
449e201c85SYohann ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
456b92dc4bSJeremy L Thompson InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
469e201c85SYohann WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
479e201c85SYohann } else if (BASIS_DIM == 3) {
482b730f8bSJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
492b730f8bSJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
506b92dc4bSJeremy L Thompson InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
512b730f8bSJeremy L Thompson WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
522b730f8bSJeremy L Thompson BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
539e201c85SYohann }
549e201c85SYohann }
559e201c85SYohann }
569e201c85SYohann
__launch_bounds__(BASIS_INTERP_BLOCK_SIZE)572b730f8bSJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
5802219a08SJeremy L Thompson void InterpCollocated(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
5902219a08SJeremy L Thompson extern __shared__ CeedScalar slice[];
6002219a08SJeremy L Thompson
6102219a08SJeremy L Thompson SharedData_Hip data;
6202219a08SJeremy L Thompson data.t_id_x = threadIdx.x;
6302219a08SJeremy L Thompson data.t_id_y = threadIdx.y;
6402219a08SJeremy L Thompson data.t_id_z = threadIdx.z;
6502219a08SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
6602219a08SJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
6702219a08SJeremy L Thompson
6802219a08SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
6902219a08SJeremy L Thompson
7002219a08SJeremy L Thompson // Apply basis element by element
7102219a08SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
7202219a08SJeremy L Thompson if (BASIS_DIM == 1) {
7302219a08SJeremy L Thompson ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
7402219a08SJeremy L Thompson WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_U, d_V);
7502219a08SJeremy L Thompson } else if (BASIS_DIM == 2) {
7602219a08SJeremy L Thompson ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
7702219a08SJeremy L Thompson WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_U, d_V);
7802219a08SJeremy L Thompson } else if (BASIS_DIM == 3) {
7902219a08SJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
8002219a08SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
8102219a08SJeremy L Thompson WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
8202219a08SJeremy L Thompson BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_U, d_V);
8302219a08SJeremy L Thompson }
8402219a08SJeremy L Thompson }
8502219a08SJeremy L Thompson }
8602219a08SJeremy L Thompson
__launch_bounds__(BASIS_INTERP_BLOCK_SIZE)8702219a08SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
88aa4002adSJeremy L Thompson void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
899e201c85SYohann extern __shared__ CeedScalar slice[];
90b2165e7aSSebastian Grimberg
919e201c85SYohann SharedData_Hip data;
929e201c85SYohann data.t_id_x = threadIdx.x;
939e201c85SYohann data.t_id_y = threadIdx.y;
949e201c85SYohann data.t_id_z = threadIdx.z;
959e201c85SYohann data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
966b92dc4bSJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
979e201c85SYohann
989e201c85SYohann CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
999e201c85SYohann CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
1009e201c85SYohann
101aa4002adSJeremy L Thompson // load interp_1d into shared memory
102aa4002adSJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
103aa4002adSJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
104aa4002adSJeremy L Thompson __syncthreads();
105aa4002adSJeremy L Thompson
106aa4002adSJeremy L Thompson // Apply basis element by element
1079e201c85SYohann for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
1089e201c85SYohann if (BASIS_DIM == 1) {
1099e201c85SYohann ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
1106b92dc4bSJeremy L Thompson InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
1119e201c85SYohann WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
1129e201c85SYohann } else if (BASIS_DIM == 2) {
1139e201c85SYohann ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
1146b92dc4bSJeremy L Thompson InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
1159e201c85SYohann WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
1169e201c85SYohann } else if (BASIS_DIM == 3) {
1172b730f8bSJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
1182b730f8bSJeremy L Thompson BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
1196b92dc4bSJeremy L Thompson InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
1202b730f8bSJeremy L Thompson WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
1212b730f8bSJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
1229e201c85SYohann }
1239e201c85SYohann }
1249e201c85SYohann }
1259e201c85SYohann
__launch_bounds__(BASIS_INTERP_BLOCK_SIZE)126db2becc9SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
12702219a08SJeremy L Thompson void InterpCollocatedTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
12802219a08SJeremy L Thompson extern __shared__ CeedScalar slice[];
12902219a08SJeremy L Thompson
13002219a08SJeremy L Thompson SharedData_Hip data;
13102219a08SJeremy L Thompson data.t_id_x = threadIdx.x;
13202219a08SJeremy L Thompson data.t_id_y = threadIdx.y;
13302219a08SJeremy L Thompson data.t_id_z = threadIdx.z;
13402219a08SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
13502219a08SJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
13602219a08SJeremy L Thompson
13702219a08SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
13802219a08SJeremy L Thompson
13902219a08SJeremy L Thompson // Apply basis element by element
14002219a08SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
14102219a08SJeremy L Thompson if (BASIS_DIM == 1) {
14202219a08SJeremy L Thompson ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
14302219a08SJeremy L Thompson WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V);
14402219a08SJeremy L Thompson } else if (BASIS_DIM == 2) {
14502219a08SJeremy L Thompson ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
14602219a08SJeremy L Thompson WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V);
14702219a08SJeremy L Thompson } else if (BASIS_DIM == 3) {
14802219a08SJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
14902219a08SJeremy L Thompson BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
15002219a08SJeremy L Thompson WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
15102219a08SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V);
15202219a08SJeremy L Thompson }
15302219a08SJeremy L Thompson }
15402219a08SJeremy L Thompson }
15502219a08SJeremy L Thompson
__launch_bounds__(BASIS_INTERP_BLOCK_SIZE)15602219a08SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
157aa4002adSJeremy L Thompson void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
158db2becc9SJeremy L Thompson extern __shared__ CeedScalar slice[];
159db2becc9SJeremy L Thompson
160db2becc9SJeremy L Thompson SharedData_Hip data;
161db2becc9SJeremy L Thompson data.t_id_x = threadIdx.x;
162db2becc9SJeremy L Thompson data.t_id_y = threadIdx.y;
163db2becc9SJeremy L Thompson data.t_id_z = threadIdx.z;
164db2becc9SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
1656b92dc4bSJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
166db2becc9SJeremy L Thompson
167db2becc9SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
168db2becc9SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
169db2becc9SJeremy L Thompson
170aa4002adSJeremy L Thompson // load interp_1d into shared memory
171aa4002adSJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
172aa4002adSJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
173aa4002adSJeremy L Thompson __syncthreads();
174aa4002adSJeremy L Thompson
175aa4002adSJeremy L Thompson // Apply basis element by element
176db2becc9SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
177db2becc9SJeremy L Thompson if (BASIS_DIM == 1) {
178db2becc9SJeremy L Thompson ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
1796b92dc4bSJeremy L Thompson InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
180db2becc9SJeremy L Thompson SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
181db2becc9SJeremy L Thompson } else if (BASIS_DIM == 2) {
182db2becc9SJeremy L Thompson ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
1836b92dc4bSJeremy L Thompson InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
184db2becc9SJeremy L Thompson SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
185db2becc9SJeremy L Thompson } else if (BASIS_DIM == 3) {
186db2becc9SJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
187db2becc9SJeremy L Thompson BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
1886b92dc4bSJeremy L Thompson InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
189db2becc9SJeremy L Thompson SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
190db2becc9SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
191db2becc9SJeremy L Thompson }
192db2becc9SJeremy L Thompson }
193db2becc9SJeremy L Thompson }
194db2becc9SJeremy L Thompson
__launch_bounds__(BASIS_INTERP_BLOCK_SIZE)19502219a08SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
19602219a08SJeremy L Thompson void InterpCollocatedTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
19702219a08SJeremy L Thompson CeedScalar *__restrict__ d_V) {
19802219a08SJeremy L Thompson extern __shared__ CeedScalar slice[];
19902219a08SJeremy L Thompson
20002219a08SJeremy L Thompson SharedData_Hip data;
20102219a08SJeremy L Thompson data.t_id_x = threadIdx.x;
20202219a08SJeremy L Thompson data.t_id_y = threadIdx.y;
20302219a08SJeremy L Thompson data.t_id_z = threadIdx.z;
20402219a08SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
20502219a08SJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
20602219a08SJeremy L Thompson
20702219a08SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
20802219a08SJeremy L Thompson
20902219a08SJeremy L Thompson // Apply basis element by element
21002219a08SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
21102219a08SJeremy L Thompson if (BASIS_DIM == 1) {
21202219a08SJeremy L Thompson ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
21302219a08SJeremy L Thompson SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_U, d_V);
21402219a08SJeremy L Thompson } else if (BASIS_DIM == 2) {
21502219a08SJeremy L Thompson ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
21602219a08SJeremy L Thompson SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_U, d_V);
21702219a08SJeremy L Thompson } else if (BASIS_DIM == 3) {
21802219a08SJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
21902219a08SJeremy L Thompson BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
22002219a08SJeremy L Thompson SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
22102219a08SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_U, d_V);
22202219a08SJeremy L Thompson }
22302219a08SJeremy L Thompson }
22402219a08SJeremy L Thompson }
22502219a08SJeremy L Thompson
2269e201c85SYohann //------------------------------------------------------------------------------
2279e201c85SYohann // Grad kernel by dim
2289e201c85SYohann //------------------------------------------------------------------------------
__launch_bounds__(BASIS_GRAD_BLOCK_SIZE)229aa4002adSJeremy L Thompson extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G,
230aa4002adSJeremy L Thompson const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
2319e201c85SYohann extern __shared__ CeedScalar slice[];
232b2165e7aSSebastian Grimberg
2339e201c85SYohann SharedData_Hip data;
2349e201c85SYohann data.t_id_x = threadIdx.x;
2359e201c85SYohann data.t_id_y = threadIdx.y;
2369e201c85SYohann data.t_id_z = threadIdx.z;
2379e201c85SYohann data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
2386b92dc4bSJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
2399e201c85SYohann
2409e201c85SYohann CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
2419e201c85SYohann CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
2429e201c85SYohann
243aa4002adSJeremy L Thompson // load interp_1d and grad_1d into shared memory
244aa4002adSJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
245aa4002adSJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
246aa4002adSJeremy L Thompson __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
247aa4002adSJeremy L Thompson LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
248aa4002adSJeremy L Thompson __syncthreads();
249aa4002adSJeremy L Thompson
250aa4002adSJeremy L Thompson // Apply basis element by element
2519e201c85SYohann for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
2529e201c85SYohann if (BASIS_DIM == 1) {
2539e201c85SYohann ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
2546b92dc4bSJeremy L Thompson Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
2559e201c85SYohann WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
2569e201c85SYohann } else if (BASIS_DIM == 2) {
2579e201c85SYohann ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
2586b92dc4bSJeremy L Thompson GradTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
2592b730f8bSJeremy L Thompson WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
2602b730f8bSJeremy L Thompson d_V);
2619e201c85SYohann } else if (BASIS_DIM == 3) {
2622b730f8bSJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
2632b730f8bSJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
2646b92dc4bSJeremy L Thompson if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
2656b92dc4bSJeremy L Thompson else GradTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
2662b730f8bSJeremy L Thompson WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
2672b730f8bSJeremy L Thompson BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
2689e201c85SYohann }
2699e201c85SYohann }
2709e201c85SYohann }
2719e201c85SYohann
__launch_bounds__(BASIS_GRAD_BLOCK_SIZE)2722b730f8bSJeremy L Thompson extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
27302219a08SJeremy L Thompson void GradCollocated(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
27402219a08SJeremy L Thompson CeedScalar *__restrict__ d_V) {
27502219a08SJeremy L Thompson extern __shared__ CeedScalar slice[];
27602219a08SJeremy L Thompson
27702219a08SJeremy L Thompson SharedData_Hip data;
27802219a08SJeremy L Thompson data.t_id_x = threadIdx.x;
27902219a08SJeremy L Thompson data.t_id_y = threadIdx.y;
28002219a08SJeremy L Thompson data.t_id_z = threadIdx.z;
28102219a08SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
28202219a08SJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
28302219a08SJeremy L Thompson
28402219a08SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
28502219a08SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
28602219a08SJeremy L Thompson
28702219a08SJeremy L Thompson // load interp_1d and grad_1d into shared memory
28802219a08SJeremy L Thompson __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
28902219a08SJeremy L Thompson LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
29002219a08SJeremy L Thompson __syncthreads();
29102219a08SJeremy L Thompson
29202219a08SJeremy L Thompson // Apply basis element by element
29302219a08SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
29402219a08SJeremy L Thompson if (BASIS_DIM == 1) {
29502219a08SJeremy L Thompson ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
29602219a08SJeremy L Thompson Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
29702219a08SJeremy L Thompson WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
29802219a08SJeremy L Thompson } else if (BASIS_DIM == 2) {
29902219a08SJeremy L Thompson ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
3000ccda8ebSJeremy L Thompson GradTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
30102219a08SJeremy L Thompson WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
30202219a08SJeremy L Thompson d_V);
30302219a08SJeremy L Thompson } else if (BASIS_DIM == 3) {
30402219a08SJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
30502219a08SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
3060ccda8ebSJeremy L Thompson GradTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
30702219a08SJeremy L Thompson WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
30802219a08SJeremy L Thompson BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
30902219a08SJeremy L Thompson }
31002219a08SJeremy L Thompson }
31102219a08SJeremy L Thompson }
31202219a08SJeremy L Thompson
__launch_bounds__(BASIS_GRAD_BLOCK_SIZE)31302219a08SJeremy L Thompson extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
314aa4002adSJeremy L Thompson void GradTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
3159e201c85SYohann CeedScalar *__restrict__ d_V) {
3169e201c85SYohann extern __shared__ CeedScalar slice[];
317b2165e7aSSebastian Grimberg
3189e201c85SYohann SharedData_Hip data;
3199e201c85SYohann data.t_id_x = threadIdx.x;
3209e201c85SYohann data.t_id_y = threadIdx.y;
3219e201c85SYohann data.t_id_z = threadIdx.z;
3229e201c85SYohann data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
3236b92dc4bSJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
3249e201c85SYohann
3259e201c85SYohann CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
3269e201c85SYohann CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
3279e201c85SYohann
328aa4002adSJeremy L Thompson // load interp_1d and grad_1d into shared memory
329aa4002adSJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
330aa4002adSJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
331aa4002adSJeremy L Thompson __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
332aa4002adSJeremy L Thompson LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
333aa4002adSJeremy L Thompson __syncthreads();
334aa4002adSJeremy L Thompson
335aa4002adSJeremy L Thompson // Apply basis element by element
3369e201c85SYohann for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
3379e201c85SYohann if (BASIS_DIM == 1) {
3389e201c85SYohann ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
3396b92dc4bSJeremy L Thompson GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
3409e201c85SYohann WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
3419e201c85SYohann } else if (BASIS_DIM == 2) {
3422b730f8bSJeremy L Thompson ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
3432b730f8bSJeremy L Thompson r_U);
3446b92dc4bSJeremy L Thompson GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
3459e201c85SYohann WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
3469e201c85SYohann } else if (BASIS_DIM == 3) {
3472b730f8bSJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
3482b730f8bSJeremy L Thompson BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
3496b92dc4bSJeremy L Thompson if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
3506b92dc4bSJeremy L Thompson else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
3512b730f8bSJeremy L Thompson WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
3522b730f8bSJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
3539e201c85SYohann }
3549e201c85SYohann }
3559e201c85SYohann }
3569e201c85SYohann
__launch_bounds__(BASIS_GRAD_BLOCK_SIZE)357db2becc9SJeremy L Thompson extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
35802219a08SJeremy L Thompson void GradCollocatedTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
35902219a08SJeremy L Thompson CeedScalar *__restrict__ d_V) {
36002219a08SJeremy L Thompson extern __shared__ CeedScalar slice[];
36102219a08SJeremy L Thompson
36202219a08SJeremy L Thompson SharedData_Hip data;
36302219a08SJeremy L Thompson data.t_id_x = threadIdx.x;
36402219a08SJeremy L Thompson data.t_id_y = threadIdx.y;
36502219a08SJeremy L Thompson data.t_id_z = threadIdx.z;
36602219a08SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
36702219a08SJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
36802219a08SJeremy L Thompson
36902219a08SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
37002219a08SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
37102219a08SJeremy L Thompson
37202219a08SJeremy L Thompson // load interp_1d and grad_1d into shared memory
37302219a08SJeremy L Thompson __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
37402219a08SJeremy L Thompson LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
37502219a08SJeremy L Thompson __syncthreads();
37602219a08SJeremy L Thompson
37702219a08SJeremy L Thompson // Apply basis element by element
37802219a08SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
37902219a08SJeremy L Thompson if (BASIS_DIM == 1) {
38002219a08SJeremy L Thompson ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
38102219a08SJeremy L Thompson GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
38202219a08SJeremy L Thompson WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
38302219a08SJeremy L Thompson } else if (BASIS_DIM == 2) {
38402219a08SJeremy L Thompson ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
38502219a08SJeremy L Thompson r_U);
3860ccda8ebSJeremy L Thompson GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
38702219a08SJeremy L Thompson WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
38802219a08SJeremy L Thompson } else if (BASIS_DIM == 3) {
38902219a08SJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
39002219a08SJeremy L Thompson BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
3910ccda8ebSJeremy L Thompson GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
39202219a08SJeremy L Thompson WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
39302219a08SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
39402219a08SJeremy L Thompson }
39502219a08SJeremy L Thompson }
39602219a08SJeremy L Thompson }
39702219a08SJeremy L Thompson
__launch_bounds__(BASIS_GRAD_BLOCK_SIZE)39802219a08SJeremy L Thompson extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
399aa4002adSJeremy L Thompson void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
400db2becc9SJeremy L Thompson CeedScalar *__restrict__ d_V) {
401db2becc9SJeremy L Thompson extern __shared__ CeedScalar slice[];
402db2becc9SJeremy L Thompson
403db2becc9SJeremy L Thompson SharedData_Hip data;
404db2becc9SJeremy L Thompson data.t_id_x = threadIdx.x;
405db2becc9SJeremy L Thompson data.t_id_y = threadIdx.y;
406db2becc9SJeremy L Thompson data.t_id_z = threadIdx.z;
407db2becc9SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
4086b92dc4bSJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
409db2becc9SJeremy L Thompson
410db2becc9SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
411db2becc9SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
412db2becc9SJeremy L Thompson
413aa4002adSJeremy L Thompson // load interp_1d and grad_1d into shared memory
414aa4002adSJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
415aa4002adSJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
416aa4002adSJeremy L Thompson __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
417aa4002adSJeremy L Thompson LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
418aa4002adSJeremy L Thompson __syncthreads();
419aa4002adSJeremy L Thompson
420aa4002adSJeremy L Thompson // Apply basis element by element
421db2becc9SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
422db2becc9SJeremy L Thompson if (BASIS_DIM == 1) {
423db2becc9SJeremy L Thompson ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
4246b92dc4bSJeremy L Thompson GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
425db2becc9SJeremy L Thompson SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
426db2becc9SJeremy L Thompson } else if (BASIS_DIM == 2) {
427db2becc9SJeremy L Thompson ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
428db2becc9SJeremy L Thompson r_U);
4296b92dc4bSJeremy L Thompson GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
430db2becc9SJeremy L Thompson SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
431db2becc9SJeremy L Thompson } else if (BASIS_DIM == 3) {
432db2becc9SJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
433db2becc9SJeremy L Thompson BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
4346b92dc4bSJeremy L Thompson if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
4356b92dc4bSJeremy L Thompson else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
436db2becc9SJeremy L Thompson SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
437db2becc9SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
438db2becc9SJeremy L Thompson }
439db2becc9SJeremy L Thompson }
440db2becc9SJeremy L Thompson }
441db2becc9SJeremy L Thompson
__launch_bounds__(BASIS_GRAD_BLOCK_SIZE)44202219a08SJeremy L Thompson extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
44302219a08SJeremy L Thompson void GradCollocatedTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
44402219a08SJeremy L Thompson CeedScalar *__restrict__ d_V) {
44502219a08SJeremy L Thompson extern __shared__ CeedScalar slice[];
44602219a08SJeremy L Thompson
44702219a08SJeremy L Thompson SharedData_Hip data;
44802219a08SJeremy L Thompson data.t_id_x = threadIdx.x;
44902219a08SJeremy L Thompson data.t_id_y = threadIdx.y;
45002219a08SJeremy L Thompson data.t_id_z = threadIdx.z;
45102219a08SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
45202219a08SJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
45302219a08SJeremy L Thompson
45402219a08SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
45502219a08SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
45602219a08SJeremy L Thompson
45702219a08SJeremy L Thompson // load interp_1d and grad_1d into shared memory
45802219a08SJeremy L Thompson __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
45902219a08SJeremy L Thompson LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
46002219a08SJeremy L Thompson __syncthreads();
46102219a08SJeremy L Thompson
46202219a08SJeremy L Thompson // Apply basis element by element
46302219a08SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
46402219a08SJeremy L Thompson if (BASIS_DIM == 1) {
46502219a08SJeremy L Thompson ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
46602219a08SJeremy L Thompson GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
46702219a08SJeremy L Thompson SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
46802219a08SJeremy L Thompson } else if (BASIS_DIM == 2) {
46902219a08SJeremy L Thompson ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
47002219a08SJeremy L Thompson r_U);
4710ccda8ebSJeremy L Thompson GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
47202219a08SJeremy L Thompson SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
47302219a08SJeremy L Thompson } else if (BASIS_DIM == 3) {
47402219a08SJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
47502219a08SJeremy L Thompson BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
4760ccda8ebSJeremy L Thompson GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
47702219a08SJeremy L Thompson SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
47802219a08SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
47902219a08SJeremy L Thompson }
48002219a08SJeremy L Thompson }
48102219a08SJeremy L Thompson }
48202219a08SJeremy L Thompson
4839e201c85SYohann //------------------------------------------------------------------------------
4849e201c85SYohann // Weight kernels by dim
4859e201c85SYohann //------------------------------------------------------------------------------
__launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE)4862b730f8bSJeremy L Thompson extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__
4872b730f8bSJeremy L Thompson void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *__restrict__ d_W) {
4889e201c85SYohann extern __shared__ CeedScalar slice[];
4899e201c85SYohann
4909e201c85SYohann SharedData_Hip data;
4919e201c85SYohann data.t_id_x = threadIdx.x;
4929e201c85SYohann data.t_id_y = threadIdx.y;
4939e201c85SYohann data.t_id_z = threadIdx.z;
4949e201c85SYohann data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
4956b92dc4bSJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);
4969e201c85SYohann
4979e201c85SYohann CeedScalar r_W[BASIS_DIM > 2 ? BASIS_Q_1D : 1];
4989e201c85SYohann
4999e201c85SYohann for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
5009e201c85SYohann if (BASIS_DIM == 1) {
501ca595be6SJeremy L Thompson Weight1d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
5029e201c85SYohann WriteElementStrided1d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_W, d_W);
5039e201c85SYohann } else if (BASIS_DIM == 2) {
504ca595be6SJeremy L Thompson WeightTensor2d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
5059e201c85SYohann WriteElementStrided2d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_W, d_W);
5069e201c85SYohann } else if (BASIS_DIM == 3) {
507ca595be6SJeremy L Thompson WeightTensor3d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
5082b730f8bSJeremy L Thompson WriteElementStrided3d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_W,
5092b730f8bSJeremy L Thompson d_W);
5109e201c85SYohann }
5119e201c85SYohann }
5129e201c85SYohann }
513