// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA shared memory non-tensor basis
#include <ceed/types.h>

#include "cuda-shared-basis-nontensor-templates.h"
#include "cuda-shared-basis-read-write-templates.h"

//------------------------------------------------------------------------------
// Interp kernels
//------------------------------------------------------------------------------

// Apply the interpolation matrix to each element: nodes (BASIS_P) -> quadrature points (BASIS_Q).
//
// Thread layout: threadIdx.z selects the element slot within the block, so each block handles
// blockDim.z elements per iteration of the grid-stride loop over `elem`; any grid size covers
// num_elem elements.
// Dynamic shared memory: `slice` is partitioned per z-layer at stride BASIS_T_1D — assumes the
// launch provides at least blockDim.z * BASIS_T_1D CeedScalars (set by the host launcher).
extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D;

  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Load interp into shared memory once per block; barrier before any thread reads it
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
    InterpNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
  }
}

// Transpose of Interp: quadrature points (BASIS_Q) -> nodes (BASIS_P), overwriting d_V.
// Same thread layout and shared-memory assumptions as Interp.
extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
                                           CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D;

  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Load interp into shared memory once per block; barrier before any thread reads it
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

// Same as InterpTranspose but accumulates into d_V (SumElementStrided1d) instead of overwriting.
extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
                                              CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D;

  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Load interp into shared memory once per block; barrier before any thread reads it
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_B, r_V);
    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

//------------------------------------------------------------------------------
// Grad kernels
//------------------------------------------------------------------------------

// Apply the gradient matrix c_G to each element: nodes (BASIS_P) -> BASIS_DIM derivative
// components per quadrature point (r_V holds BASIS_NUM_COMP * BASIS_DIM values).
// Same thread layout and dynamic shared-memory assumptions as Interp.
extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D;

  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];

  // Load grad into shared memory once per block; barrier before any thread reads it
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
    GradNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
  }
}

// Transpose of Grad: BASIS_DIM derivative components at quadrature points -> nodes (BASIS_P),
// overwriting d_V.
extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                                         CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D;

  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Load grad into shared memory once per block; barrier before any thread reads it
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

// Same as GradTranspose but accumulates into d_V (SumElementStrided1d) instead of overwriting.
extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                                            CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D;

  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Load grad into shared memory once per block; barrier before any thread reads it
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q, BASIS_T_1D>(data, r_U, s_G, r_V);
    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

//------------------------------------------------------------------------------
// Weight kernel
//------------------------------------------------------------------------------

// Write the quadrature weights (BASIS_Q per element) into d_W for every element.
// q_weight is read-only reference data; no shared-memory matrix load is needed, so there is
// no __syncthreads() here. Same element-slot thread layout as the kernels above.
extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D;

  CeedScalar r_W[1];

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    WeightNonTensor<BASIS_Q>(data, q_weight, r_W);
    WriteElementStrided1d<1, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_W, d_W);
  }
}