// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA shared memory non-tensor basis
#include <ceed/types.h>

#include "cuda-shared-basis-nontensor-templates.h"
#include "cuda-shared-basis-read-write-templates.h"

// NOTE(review): T_1D, BASIS_P, BASIS_Q, BASIS_DIM, and BASIS_NUM_COMP are expected to be
// defined as compile-time macros when this source is JIT-compiled — confirm against the
// backend code that builds these kernels.
//
// Common thread layout for all kernels below: threadIdx.z selects an element slot within
// the block, and elements are distributed across blocks in a grid-stride fashion
// (stride = gridDim.x * blockDim.z). Each kernel expects dynamic shared memory for
// `slice` (one T_1D-scalar layer per threadIdx.z), sized at launch, in addition to the
// statically declared shared basis matrix.

//------------------------------------------------------------------------------
// Interp kernel by dim
//------------------------------------------------------------------------------

// Apply the non-tensor interpolation matrix (loaded as BASIS_P x BASIS_Q) to every element.
//
// d_U: input E-vector, read with node stride 1 and component stride BASIS_P * num_elem
// d_V: output values at quadrature points, written with component stride BASIS_Q * num_elem
extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D;

  // Per-thread register storage for one element's input nodes and output quadrature values
  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Load interp matrix into shared memory once per block
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();  // s_B must be fully written before any thread applies it

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
    InterpNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
  }
}

// Apply the transpose of the interpolation matrix: quadrature values (BASIS_Q) back to
// nodes (BASIS_P). Output d_V is overwritten (see InterpTransposeAdd for accumulation).
extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
                                           CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D;

  // Per-thread register storage: input quadrature values, output node values
  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Load interp matrix into shared memory once per block
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();  // s_B must be fully written before any thread applies it

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

// Same as InterpTranspose, but accumulates into d_V (SumElementStrided1d) instead of
// overwriting, for assembling contributions across calls.
extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
                                              CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D;

  // Per-thread register storage: input quadrature values, output node values
  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Load interp matrix into shared memory once per block
  __shared__ CeedScalar s_B[BASIS_P * BASIS_Q];
  LoadMatrix<BASIS_P, BASIS_Q>(data, c_B, s_B);
  __syncthreads();  // s_B must be fully written before any thread applies it

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    InterpTransposeNonTensor<BASIS_NUM_COMP, BASIS_P, BASIS_Q>(data, r_U, s_B, r_V);
    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

//------------------------------------------------------------------------------
// Grad kernel by dim
//------------------------------------------------------------------------------

// Apply the non-tensor gradient matrix (loaded as BASIS_P x (BASIS_Q * BASIS_DIM)):
// node values in, BASIS_DIM derivative components per quadrature point out.
extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D;

  // Per-thread register storage: input node values, output gradients (one set per dimension)
  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];

  // Load grad matrix into shared memory once per block
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
  __syncthreads();  // s_G must be fully written before any thread applies it

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, d_U, r_U);
    GradNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_V, d_V);
  }
}

// Apply the transpose of the gradient matrix: BASIS_DIM derivative components per
// quadrature point in, node values out. Output d_V is overwritten.
extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                                         CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D;

  // Per-thread register storage: input gradients (one set per dimension), output node values
  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Load grad matrix into shared memory once per block
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
  __syncthreads();  // s_G must be fully written before any thread applies it

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
    WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

// Same as GradTranspose, but accumulates into d_V (SumElementStrided1d) instead of
// overwriting, for assembling contributions across calls.
extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                                            CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D;

  // Per-thread register storage: input gradients (one set per dimension), output node values
  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
  CeedScalar r_V[BASIS_NUM_COMP];

  // Load grad matrix into shared memory once per block
  __shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
  LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
  __syncthreads();  // s_G must be fully written before any thread applies it

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    ReadElementStrided1d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, d_U, r_U);
    GradTransposeNonTensor<BASIS_NUM_COMP, BASIS_DIM, BASIS_P, BASIS_Q>(data, r_U, s_G, r_V);
    SumElementStrided1d<BASIS_NUM_COMP, BASIS_P>(data, elem, 1, BASIS_P * num_elem, BASIS_P, r_V, d_V);
  }
}

//------------------------------------------------------------------------------
// Weight kernels by dim
//------------------------------------------------------------------------------

// Write the BASIS_Q quadrature weights into d_W for every element; the weights are the
// same for each element, so q_weight is read once per element-slot and replicated.
extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D;

  // Single-component register storage for this thread's weight value
  CeedScalar r_W[1];

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    WeightNonTensor<BASIS_Q>(data, q_weight, r_W);
    WriteElementStrided1d<1, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_W, d_W);
  }
}